From 94810d121d6b547262a7659939aeff6b21016df7 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska <48889395+arw2019@users.noreply.github.com> Date: Wed, 30 Dec 2020 08:59:00 -0500 Subject: [PATCH] ENH: support downcasting of nullable EAs in pd.to_numeric (#38746) --- doc/source/whatsnew/v1.3.0.rst | 1 + pandas/core/tools/numeric.py | 35 ++++++++++++++++++++++++ pandas/tests/tools/test_to_numeric.py | 39 +++++++++++++++++++++++++++ 3 files changed, 75 insertions(+) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index d261f675f3749..ed4348d25f606 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -50,6 +50,7 @@ Other enhancements - Improved consistency of error message when passing an invalid ``win_type`` argument in :class:`Window` (:issue:`15969`) - :func:`pandas.read_sql_query` now accepts a ``dtype`` argument to cast the columnar data from the SQL database based on user input (:issue:`10285`) - Improved integer type mapping from pandas to SQLAlchemy when using :meth:`DataFrame.to_sql` (:issue:`35076`) +- :func:`to_numeric` now supports downcasting of nullable ``ExtensionDtype`` objects (:issue:`33013`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index 08cdfde7df58d..1389aba9525d3 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -7,6 +7,7 @@ ensure_object, is_datetime_or_timedelta_dtype, is_decimal, + is_integer_dtype, is_number, is_numeric_dtype, is_scalar, @@ -15,6 +16,7 @@ from pandas.core.dtypes.generic import ABCIndex, ABCSeries import pandas as pd +from pandas.core.arrays.numeric import NumericArray def to_numeric(arg, errors="raise", downcast=None): @@ -108,6 +110,21 @@ def to_numeric(arg, errors="raise", downcast=None): 2 2.0 3 -3.0 dtype: float64 + + Downcasting of nullable integer and floating dtypes is supported: + + >>> s = pd.Series([1, 2, 3], dtype="Int64") + >>> pd.to_numeric(s, downcast="integer") + 0 1 + 1 2 + 2 3 + dtype: Int8 + >>> s = pd.Series([1.0, 2.1, 3.0], dtype="Float64") + >>> pd.to_numeric(s, downcast="float") + 0 1.0 + 1 2.1 + 2 3.0 + dtype: Float32 """ if downcast not in (None, "integer", "signed", "unsigned", "float"): raise ValueError("invalid downcasting method provided") @@ -142,6 +159,14 @@ def to_numeric(arg, errors="raise", downcast=None): else: values = arg + # GH33013: for IntegerArray & FloatingArray extract non-null values for casting + # save mask to reconstruct the full array after casting + if isinstance(values, NumericArray): + mask = values._mask + values = values._data[~mask] + else: + mask = None + values_dtype = getattr(values, "dtype", None) if is_numeric_dtype(values_dtype): pass @@ -188,6 +213,16 @@ def to_numeric(arg, errors="raise", downcast=None): if values.dtype == dtype: break + # GH33013: for IntegerArray & FloatingArray need to reconstruct masked array + if mask is not None: + data = np.zeros(mask.shape, dtype=values.dtype) + data[~mask] = values + + from pandas.core.arrays import FloatingArray, IntegerArray + + klass = IntegerArray if is_integer_dtype(data.dtype) else FloatingArray + values = klass(data, mask) + if is_series: return arg._constructor(values, index=arg.index, name=arg.name) elif is_index: diff --git a/pandas/tests/tools/test_to_numeric.py b/pandas/tests/tools/test_to_numeric.py index f89958f7723ef..80446e464985c 100644 --- a/pandas/tests/tools/test_to_numeric.py +++ b/pandas/tests/tools/test_to_numeric.py @@ -725,3 +725,42 @@ def test_to_numeric_from_nullable_string(values, expected): s = Series(values, dtype="string") result = to_numeric(s) tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "data, input_dtype, downcast, expected_dtype", + ( + ([1, 1], "Int64", "integer", "Int8"), + ([1.0, pd.NA], "Float64", "integer", "Int8"), + ([1.0, 1.1], "Float64", "integer", "Float64"), + ([1, pd.NA], "Int64", "integer", "Int8"), + ([450, 300], "Int64", "integer", "Int16"), + ([1, 1], "Float64", "integer", "Int8"), + ([np.iinfo(np.int64).max - 1, 1], "Int64", "integer", "Int64"), + ([1, 1], "Int64", "signed", "Int8"), + ([1.0, 1.0], "Float32", "signed", "Int8"), + ([1.0, 1.1], "Float64", "signed", "Float64"), + ([1, pd.NA], "Int64", "signed", "Int8"), + ([450, -300], "Int64", "signed", "Int16"), + pytest.param( + [np.iinfo(np.uint64).max - 1, 1], + "UInt64", + "signed", + "UInt64", + marks=pytest.mark.xfail(reason="GH38798"), + ), + ([1, 1], "Int64", "unsigned", "UInt8"), + ([1.0, 1.0], "Float32", "unsigned", "UInt8"), + ([1.0, 1.1], "Float64", "unsigned", "Float64"), + ([1, pd.NA], "Int64", "unsigned", "UInt8"), + ([450, -300], "Int64", "unsigned", "Int64"), + ([-1, -1], "Int32", "unsigned", "Int32"), + ([1, 1], "Float64", "float", "Float32"), + ([1, 1.1], "Float64", "float", "Float32"), + ), +) +def test_downcast_nullable_numeric(data, input_dtype, downcast, expected_dtype): + arr = pd.array(data, dtype=input_dtype) + result = pd.to_numeric(arr, downcast=downcast) + expected = pd.array(data, dtype=expected_dtype) + tm.assert_extension_array_equal(result, expected)