ENH: support downcasting of nullable EAs in pd.to_numeric (#38746)

pandas-dev · Dec 30, 2020 · commit 94810d1
1 parent f44f3ec


Showing 3 changed files with 75 additions and 0 deletions.
diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
@@ -50,6 +50,7 @@ Other enhancements
 - Improved consistency of error message when passing an invalid ``win_type`` argument in :class:`Window` (:issue:`15969`)
 - :func:`pandas.read_sql_query` now accepts a ``dtype`` argument to cast the columnar data from the SQL database based on user input (:issue:`10285`)
 - Improved integer type mapping from pandas to SQLAlchemy when using :meth:`DataFrame.to_sql` (:issue:`35076`)
+- :func:`to_numeric` now supports downcasting of nullable ``ExtensionDtype`` objects (:issue:`33013`)
 
 .. ---------------------------------------------------------------------------
 

diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py
@@ -7,6 +7,7 @@
     ensure_object,
     is_datetime_or_timedelta_dtype,
     is_decimal,
+    is_integer_dtype,
     is_number,
     is_numeric_dtype,
     is_scalar,
@@ -15,6 +16,7 @@
 from pandas.core.dtypes.generic import ABCIndex, ABCSeries
 
 import pandas as pd
+from pandas.core.arrays.numeric import NumericArray
 
 
 def to_numeric(arg, errors="raise", downcast=None):
@@ -108,6 +110,21 @@ def to_numeric(arg, errors="raise", downcast=None):
     2    2.0
     3   -3.0
     dtype: float64
+
+    Downcasting of nullable integer and floating dtypes is supported:
+
+    >>> s = pd.Series([1, 2, 3], dtype="Int64")
+    >>> pd.to_numeric(s, downcast="integer")
+    0    1
+    1    2
+    2    3
+    dtype: Int8
+    >>> s = pd.Series([1.0, 2.1, 3.0], dtype="Float64")
+    >>> pd.to_numeric(s, downcast="float")
+    0    1.0
+    1    2.1
+    2    3.0
+    dtype: Float32
     """
     if downcast not in (None, "integer", "signed", "unsigned", "float"):
         raise ValueError("invalid downcasting method provided")
@@ -142,6 +159,14 @@ def to_numeric(arg, errors="raise", downcast=None):
     else:
         values = arg
 
+    # GH33013: for IntegerArray & FloatingArray extract non-null values for casting
+    # save mask to reconstruct the full array after casting
+    if isinstance(values, NumericArray):
+        mask = values._mask
+        values = values._data[~mask]
+    else:
+        mask = None
+
     values_dtype = getattr(values, "dtype", None)
     if is_numeric_dtype(values_dtype):
         pass
@@ -188,6 +213,16 @@ def to_numeric(arg, errors="raise", downcast=None):
                     if values.dtype == dtype:
                         break
 
+    # GH33013: for IntegerArray & FloatingArray need to reconstruct masked array
+    if mask is not None:
+        data = np.zeros(mask.shape, dtype=values.dtype)
+        data[~mask] = values
+
+        from pandas.core.arrays import FloatingArray, IntegerArray
+
+        klass = IntegerArray if is_integer_dtype(data.dtype) else FloatingArray
+        values = klass(data, mask)
+
     if is_series:
         return arg._constructor(values, index=arg.index, name=arg.name)
     elif is_index:

diff --git a/pandas/tests/tools/test_to_numeric.py b/pandas/tests/tools/test_to_numeric.py
@@ -725,3 +725,42 @@ def test_to_numeric_from_nullable_string(values, expected):
     s = Series(values, dtype="string")
     result = to_numeric(s)
     tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "data, input_dtype, downcast, expected_dtype",
+    (
+        ([1, 1], "Int64", "integer", "Int8"),
+        ([1.0, pd.NA], "Float64", "integer", "Int8"),  # NA entry is kept
+        ([1.0, 1.1], "Float64", "integer", "Float64"),  # non-integral: dtype kept
+        ([1, pd.NA], "Int64", "integer", "Int8"),
+        ([450, 300], "Int64", "integer", "Int16"),
+        ([1, 1], "Float64", "integer", "Int8"),
+        ([np.iinfo(np.int64).max - 1, 1], "Int64", "integer", "Int64"),  # dtype kept
+        ([1, 1], "Int64", "signed", "Int8"),
+        ([1.0, 1.0], "Float32", "signed", "Int8"),
+        ([1.0, 1.1], "Float64", "signed", "Float64"),  # non-integral: dtype kept
+        ([1, pd.NA], "Int64", "signed", "Int8"),
+        ([450, -300], "Int64", "signed", "Int16"),
+        pytest.param(
+            [np.iinfo(np.uint64).max - 1, 1],
+            "UInt64",
+            "signed",
+            "UInt64",
+            marks=pytest.mark.xfail(reason="GH38798"),
+        ),
+        ([1, 1], "Int64", "unsigned", "UInt8"),
+        ([1.0, 1.0], "Float32", "unsigned", "UInt8"),
+        ([1.0, 1.1], "Float64", "unsigned", "Float64"),  # non-integral: dtype kept
+        ([1, pd.NA], "Int64", "unsigned", "UInt8"),
+        ([450, -300], "Int64", "unsigned", "Int64"),  # has a negative: dtype kept
+        ([-1, -1], "Int32", "unsigned", "Int32"),  # all negative: dtype kept
+        ([1, 1], "Float64", "float", "Float32"),
+        ([1, 1.1], "Float64", "float", "Float32"),
+    ),
+)
+def test_downcast_nullable_numeric(data, input_dtype, downcast, expected_dtype):
+    arr = pd.array(data, dtype=input_dtype)  # GH33013: nullable EA input
+    result = pd.to_numeric(arr, downcast=downcast)
+    expected = pd.array(data, dtype=expected_dtype)  # same data, target dtype
+    tm.assert_extension_array_equal(result, expected)