Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: Better handle larger numbers in to_numeric #24956

Merged
merged 2 commits into from
Jan 31, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.25.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,8 @@ Timezones
Numeric
^^^^^^^

- Bug in :meth:`to_numeric` in which large negative numbers were being improperly handled (:issue:`24910`)
- Bug in :meth:`to_numeric` in which numbers were being coerced to float, even though ``errors`` was not ``coerce`` (:issue:`24910`)
-
-
-
Expand Down
25 changes: 15 additions & 10 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1828,7 +1828,7 @@ def maybe_convert_numeric(ndarray[object] values, set na_values,
except (ValueError, OverflowError, TypeError):
pass

# otherwise, iterate and do full infererence
# Otherwise, iterate and do full inference.
cdef:
int status, maybe_int
Py_ssize_t i, n = values.size
Expand Down Expand Up @@ -1865,10 +1865,10 @@ def maybe_convert_numeric(ndarray[object] values, set na_values,
else:
seen.float_ = True

if val <= oINT64_MAX:
if oINT64_MIN <= val <= oINT64_MAX:
ints[i] = val

if seen.sint_ and seen.uint_:
if val < oINT64_MIN or (seen.sint_ and seen.uint_):
seen.float_ = True

elif util.is_bool_object(val):
Expand Down Expand Up @@ -1910,23 +1910,28 @@ def maybe_convert_numeric(ndarray[object] values, set na_values,
else:
seen.saw_int(as_int)

if not (seen.float_ or as_int in na_values):
if as_int not in na_values:
if as_int < oINT64_MIN or as_int > oUINT64_MAX:
raise ValueError('Integer out of range.')
if seen.coerce_numeric:
seen.float_ = True
else:
raise ValueError("Integer out of range.")
else:
if as_int >= 0:
uints[i] = as_int

if as_int >= 0:
uints[i] = as_int
if as_int <= oINT64_MAX:
ints[i] = as_int
if as_int <= oINT64_MAX:
ints[i] = as_int

seen.float_ = seen.float_ or (seen.uint_ and seen.sint_)
else:
seen.float_ = True
except (TypeError, ValueError) as e:
if not seen.coerce_numeric:
raise type(e)(str(e) + ' at position {pos}'.format(pos=i))
raise type(e)(str(e) + " at position {pos}".format(pos=i))
elif "uint64" in str(e): # Exception from check functions.
raise

seen.saw_null()
floats[i] = NaN

Expand Down
8 changes: 8 additions & 0 deletions pandas/core/tools/numeric.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,14 @@ def to_numeric(arg, errors='raise', downcast=None):
depending on the data supplied. Use the `downcast` parameter
to obtain other dtypes.

Please note that precision loss may occur if really large numbers
are passed in. Due to the internal limitations of `ndarray`, if
numbers smaller than `-9223372036854775808` (np.iinfo(np.int64).min)
or larger than `18446744073709551615` (np.iinfo(np.uint64).max) are
passed in, it is very likely they will be converted to float so that
they can be stored in an `ndarray`. These warnings apply similarly to
`Series` since it internally leverages `ndarray`.

Parameters
----------
arg : scalar, list, tuple, 1-d array, or Series
Expand Down
148 changes: 133 additions & 15 deletions pandas/tests/tools/test_numeric.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,50 @@
from numpy import iinfo
import pytest

import pandas.compat as compat

import pandas as pd
from pandas import DataFrame, Index, Series, to_numeric
from pandas.util import testing as tm


@pytest.fixture(
    params=[None, "ignore", "raise", "coerce"],
)
def errors(request):
    """
    Fixture cycling through the ``errors`` options exercised by the tests.

    ``None`` is included so that consuming tests can omit the ``errors``
    keyword altogether and hit the default code path.
    """
    return request.param


@pytest.fixture(
    params=[True, False],
)
def signed(request):
    """
    Boolean fixture; consuming tests negate their value when it is True.
    """
    return request.param


@pytest.fixture(
    params=[lambda x: x, str],
    ids=["identity", "str"],
)
def transform(request):
    """
    Fixture providing a one-argument callable applied to the test value.

    Either returns the value unchanged ("identity") or converts it to
    its string representation ("str").
    """
    return request.param


@pytest.fixture(
    params=[47393996303418497800, 100000000000000000000],
)
def large_val(request):
    """
    Fixture providing positive integers above the uint64 maximum
    (18446744073709551615).
    """
    return request.param


@pytest.fixture(
    params=[True, False],
)
def multiple_elts(request):
    """
    Boolean fixture; when True, consuming tests add a second element
    to the array they build.
    """
    return request.param


@pytest.fixture(
    params=[
        (lambda x: Index(x, name="idx"), tm.assert_index_equal),
        (lambda x: Series(x, name="ser"), tm.assert_series_equal),
        (lambda x: np.array(Index(x).values), tm.assert_numpy_array_equal),
    ],
)
def transform_assert_equal(request):
    """
    Fixture providing ``(constructor, assert_func)`` pairs.

    Each constructor wraps its input as an ``Index``, a ``Series`` or a
    NumPy array; the paired ``tm`` function is the equality assertion
    for that container type.
    """
    return request.param


@pytest.mark.parametrize("input_kwargs,result_kwargs", [
(dict(), dict(dtype=np.int64)),
(dict(errors="coerce", downcast="integer"), dict(dtype=np.int8))
Expand Down Expand Up @@ -172,7 +211,6 @@ def test_all_nan():
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("errors", [None, "ignore", "raise", "coerce"])
def test_type_check(errors):
# see gh-11776
df = DataFrame({"a": [1, -3.14, 7], "b": ["4", "5", "6"]})
Expand All @@ -183,11 +221,100 @@ def test_type_check(errors):
to_numeric(df, **kwargs)


@pytest.mark.parametrize("val", [1, 1.1, "1", "1.1", -1.5, "-1.5"])
def test_scalar(val):
    """Scalar numbers and numeric strings convert to the matching value."""
    result = to_numeric(val)
    assert result == float(val)
@pytest.mark.parametrize("val", [1, 1.1, 20001])
def test_scalar(val, signed, transform):
    """
    Scalars convert to the matching numeric value, whether negated or
    not and whether passed as a number or as its string form.
    """
    if signed:
        val = -val

    result = to_numeric(transform(val))
    assert result == float(val)


def test_really_large_scalar(large_val, signed, transform, errors):
    """
    Check ``to_numeric`` on a single scalar outside the 64-bit range.

    A string value must raise ``ValueError`` under the default or
    ``errors="raise"``. Otherwise the value is expected back unchanged,
    except that a string under ``errors="coerce"`` becomes a float.
    """
    # see gh-24910
    kwargs = dict(errors=errors) if errors is not None else dict()
    val = -large_val if signed else large_val

    val = transform(val)
    val_is_string = isinstance(val, str)

    if val_is_string and errors in (None, "raise"):
        msg = "Integer out of range. at position 0"
        with pytest.raises(ValueError, match=msg):
            to_numeric(val, **kwargs)
    else:
        expected = float(val) if (errors == "coerce" and
                                  val_is_string) else val
        # The helper raises on mismatch, so it is called directly rather
        # than wrapped in ``assert``; this matches the other tests in
        # this module and does not rely on the helper's return value.
        tm.assert_almost_equal(to_numeric(val, **kwargs), expected)


def test_really_large_in_arr(large_val, signed, transform,
                             multiple_elts, errors):
    """
    Check ``to_numeric`` on a list whose first element is outside the
    64-bit range, optionally followed by an unparseable string.
    """
    # see gh-24910
    kwargs = dict() if errors is None else dict(errors=errors)
    val = transform(-large_val if signed else large_val)

    extra_elt = "string"
    arr = [val]
    if multiple_elts:
        arr.append(extra_elt)

    val_is_string = isinstance(val, str)
    coercing = errors == "coerce"
    raising = errors in (None, "raise")

    if raising and (val_is_string or multiple_elts):
        # The out-of-range string at position 0 is reported first;
        # otherwise the failure is the unparseable extra element.
        if val_is_string:
            msg = "Integer out of range. at position 0"
        else:
            msg = 'Unable to parse string "string" at position 1'

        with pytest.raises(ValueError, match=msg):
            to_numeric(arr, **kwargs)
        return

    result = to_numeric(arr, **kwargs)

    if coercing and val_is_string:
        exp_val = float(val)
    else:
        exp_val = val
    expected = [exp_val]

    if not multiple_elts:
        is_number = isinstance(exp_val, (int, compat.long, float))
        exp_dtype = float if is_number else object
    elif coercing:
        expected.append(np.nan)
        exp_dtype = float
    else:
        expected.append(extra_elt)
        exp_dtype = object

    tm.assert_almost_equal(result, np.array(expected, dtype=exp_dtype))


def test_really_large_in_arr_consistent(large_val, signed,
                                        multiple_elts, errors):
    """
    Check that an out-of-range integer string is handled the same way
    whether or not a large integer precedes it in the list.
    """
    # see gh-24910
    #
    # Even if we discover that we have to hold float, does not mean
    # we should be lenient on subsequent elements that fail to be integer.
    kwargs = dict() if errors is None else dict(errors=errors)
    str_val = str(-large_val if signed else large_val)
    arr = [large_val, str_val] if multiple_elts else [str_val]

    if errors in (None, "raise"):
        # The string sits at position 1 when the integer precedes it.
        index = int(multiple_elts)
        msg = "Integer out of range. at position {index}".format(index=index)

        with pytest.raises(ValueError, match=msg):
            to_numeric(arr, **kwargs)
        return

    result = to_numeric(arr, **kwargs)

    if errors == "coerce":
        expected = [float(i) for i in arr]
        exp_dtype = float
    else:
        expected, exp_dtype = arr, object

    tm.assert_almost_equal(result, np.array(expected, dtype=exp_dtype))


@pytest.mark.parametrize("errors,checker", [
Expand All @@ -205,15 +332,6 @@ def test_scalar_fail(errors, checker):
assert checker(to_numeric(scalar, errors=errors))


@pytest.fixture(
    params=[
        (lambda x: Index(x, name="idx"), tm.assert_index_equal),
        (lambda x: Series(x, name="ser"), tm.assert_series_equal),
        (lambda x: np.array(Index(x).values), tm.assert_numpy_array_equal),
    ],
)
def transform_assert_equal(request):
    """
    Fixture providing ``(constructor, assert_func)`` pairs.

    Each constructor wraps its input as an ``Index``, a ``Series`` or a
    NumPy array; the paired ``tm`` function is the equality assertion
    for that container type.
    """
    return request.param


@pytest.mark.parametrize("data", [
[1, 2, 3],
[1., np.nan, 3, np.nan]
Expand Down