Skip to content

Commit

Permalink
ENH: support downcasting of nullable EAs in pd.to_numeric (#38746)
Browse files Browse the repository at this point in the history
  • Loading branch information
arw2019 authored Dec 30, 2020
1 parent f44f3ec commit 94810d1
Show file tree
Hide file tree
Showing 3 changed files with 75 additions and 0 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ Other enhancements
- Improved consistency of error message when passing an invalid ``win_type`` argument in :class:`Window` (:issue:`15969`)
- :func:`pandas.read_sql_query` now accepts a ``dtype`` argument to cast the columnar data from the SQL database based on user input (:issue:`10285`)
- Improved integer type mapping from pandas to SQLAlchemy when using :meth:`DataFrame.to_sql` (:issue:`35076`)
- :func:`to_numeric` now supports downcasting of nullable ``ExtensionDtype`` objects (:issue:`33013`)

.. ---------------------------------------------------------------------------
Expand Down
35 changes: 35 additions & 0 deletions pandas/core/tools/numeric.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
ensure_object,
is_datetime_or_timedelta_dtype,
is_decimal,
is_integer_dtype,
is_number,
is_numeric_dtype,
is_scalar,
Expand All @@ -15,6 +16,7 @@
from pandas.core.dtypes.generic import ABCIndex, ABCSeries

import pandas as pd
from pandas.core.arrays.numeric import NumericArray


def to_numeric(arg, errors="raise", downcast=None):
Expand Down Expand Up @@ -108,6 +110,21 @@ def to_numeric(arg, errors="raise", downcast=None):
2 2.0
3 -3.0
dtype: float64
Downcasting of nullable integer and floating dtypes is supported:
>>> s = pd.Series([1, 2, 3], dtype="Int64")
>>> pd.to_numeric(s, downcast="integer")
0 1
1 2
2 3
dtype: Int8
>>> s = pd.Series([1.0, 2.1, 3.0], dtype="Float64")
>>> pd.to_numeric(s, downcast="float")
0 1.0
1 2.1
2 3.0
dtype: Float32
"""
if downcast not in (None, "integer", "signed", "unsigned", "float"):
raise ValueError("invalid downcasting method provided")
Expand Down Expand Up @@ -142,6 +159,14 @@ def to_numeric(arg, errors="raise", downcast=None):
else:
values = arg

# GH33013: for IntegerArray & FloatingArray extract non-null values for casting
# save mask to reconstruct the full array after casting
if isinstance(values, NumericArray):
mask = values._mask
values = values._data[~mask]
else:
mask = None

values_dtype = getattr(values, "dtype", None)
if is_numeric_dtype(values_dtype):
pass
Expand Down Expand Up @@ -188,6 +213,16 @@ def to_numeric(arg, errors="raise", downcast=None):
if values.dtype == dtype:
break

# GH33013: for IntegerArray & FloatingArray need to reconstruct masked array
if mask is not None:
data = np.zeros(mask.shape, dtype=values.dtype)
data[~mask] = values

from pandas.core.arrays import FloatingArray, IntegerArray

klass = IntegerArray if is_integer_dtype(data.dtype) else FloatingArray
values = klass(data, mask)

if is_series:
return arg._constructor(values, index=arg.index, name=arg.name)
elif is_index:
Expand Down
39 changes: 39 additions & 0 deletions pandas/tests/tools/test_to_numeric.py
Original file line number Diff line number Diff line change
Expand Up @@ -725,3 +725,42 @@ def test_to_numeric_from_nullable_string(values, expected):
s = Series(values, dtype="string")
result = to_numeric(s)
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize(
    ["data", "input_dtype", "downcast", "expected_dtype"],
    [
        # downcast="integer"
        ([1, 1], "Int64", "integer", "Int8"),
        ([1.0, pd.NA], "Float64", "integer", "Int8"),
        # non-integral floats keep their floating dtype
        ([1.0, 1.1], "Float64", "integer", "Float64"),
        ([1, pd.NA], "Int64", "integer", "Int8"),
        ([450, 300], "Int64", "integer", "Int16"),
        ([1, 1], "Float64", "integer", "Int8"),
        ([np.iinfo(np.int64).max - 1, 1], "Int64", "integer", "Int64"),
        # downcast="signed"
        ([1, 1], "Int64", "signed", "Int8"),
        ([1.0, 1.0], "Float32", "signed", "Int8"),
        ([1.0, 1.1], "Float64", "signed", "Float64"),
        ([1, pd.NA], "Int64", "signed", "Int8"),
        ([450, -300], "Int64", "signed", "Int16"),
        # marked as an expected failure, tracked in GH38798
        pytest.param(
            [np.iinfo(np.uint64).max - 1, 1],
            "UInt64",
            "signed",
            "UInt64",
            marks=pytest.mark.xfail(reason="GH38798"),
        ),
        # downcast="unsigned"
        ([1, 1], "Int64", "unsigned", "UInt8"),
        ([1.0, 1.0], "Float32", "unsigned", "UInt8"),
        ([1.0, 1.1], "Float64", "unsigned", "Float64"),
        ([1, pd.NA], "Int64", "unsigned", "UInt8"),
        # negative values: the input dtype is expected back unchanged
        ([450, -300], "Int64", "unsigned", "Int64"),
        ([-1, -1], "Int32", "unsigned", "Int32"),
        # downcast="float"
        ([1, 1], "Float64", "float", "Float32"),
        ([1, 1.1], "Float64", "float", "Float32"),
    ],
)
def test_downcast_nullable_numeric(data, input_dtype, downcast, expected_dtype):
    """
    Check that ``pd.to_numeric`` downcasts nullable arrays as expected (GH33013).

    ``data`` is wrapped in a nullable array of ``input_dtype`` and passed to
    ``pd.to_numeric`` with the given ``downcast`` method; the result must equal
    the same ``data`` materialised directly with ``expected_dtype``.
    """
    nullable_input = pd.array(data, dtype=input_dtype)
    tm.assert_extension_array_equal(
        pd.to_numeric(nullable_input, downcast=downcast),
        pd.array(data, dtype=expected_dtype),
    )

0 comments on commit 94810d1

Please sign in to comment.