diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
index 68c1839221508c..877c934b2a7b41 100644
--- a/doc/source/whatsnew/v0.24.0.txt
+++ b/doc/source/whatsnew/v0.24.0.txt
@@ -36,6 +36,7 @@ Datetimelike API Changes
 Other API Changes
 ^^^^^^^^^^^^^^^^^
 
+- :class:`Series` and :class:`Index` constructors now raise when the data is incompatible with the specified ``dtype`` (:issue:`15832`)
 -
 -
 -
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
index ebc7a13234a986..5fa6662c407bb1 100644
--- a/pandas/core/dtypes/cast.py
+++ b/pandas/core/dtypes/cast.py
@@ -20,6 +20,7 @@
                      is_dtype_equal,
                      is_float_dtype, is_complex_dtype,
                      is_integer_dtype,
+                     is_unsigned_integer_dtype,
                      is_datetime_or_timedelta_dtype,
                      is_bool_dtype, is_scalar,
                      is_string_dtype, _string_dtypes,
@@ -1269,3 +1270,74 @@ def construct_1d_ndarray_preserving_na(values, dtype=None, copy=False):
         subarr = subarr2
 
     return subarr
+
+
+def maybe_cast_to_integer_array(arr, dtype, copy=False):
+    """
+    Takes any dtype and returns the casted version, raising when the data
+    is incompatible with integer/unsigned integer dtypes.
+
+    .. versionadded:: 0.24.0
+
+    Parameters
+    ----------
+    arr : ndarray
+        The array to cast.
+    dtype : str, np.dtype
+        The integer dtype to cast the array to.
+    copy : bool, default False
+        Whether to make a copy of the array before returning.
+
+    Returns
+    -------
+    int_arr : ndarray
+        ``arr`` cast to ``dtype``. A lossy cast that matches none of the
+        conditions listed under Raises is returned as NumPy produced it.
+
+    Raises
+    ------
+    OverflowError : the values cannot be cast to ``dtype``, or negative
+        values would be coerced to an unsigned integer dtype
+    ValueError : float or object data would be altered by the integer cast
+
+    Examples
+    --------
+    If you try to coerce negative values to unsigned integers, it raises:
+
+    >>> Series([-1], dtype="uint64")
+    Traceback (most recent call last):
+        ...
+    OverflowError: Trying to coerce negative values to unsigned integers
+
+    Also, if you try to coerce float values to integers, it raises:
+
+    >>> Series([1, 2, 3.5], dtype="int64")
+    Traceback (most recent call last):
+        ...
+    ValueError: Trying to coerce float values to integers
+    """
+
+    try:
+        casted = arr.astype(dtype, copy=copy)
+    except OverflowError:
+        # Re-raise with a more user-friendly error message.
+        raise OverflowError("The elements provided in the data cannot all be "
+                            "casted to the dtype {dtype}".format(dtype=dtype))
+
+    # Lossless cast: nothing further to diagnose.
+    if np.array_equal(arr, casted):
+        return casted
+
+    if is_unsigned_integer_dtype(dtype) and (arr < 0).any():
+        raise OverflowError("Trying to coerce negative values "
+                            "to unsigned integers")
+
+    if is_integer_dtype(dtype) and (is_float_dtype(arr) or
+                                    is_object_dtype(arr)):
+        raise ValueError("Trying to coerce float values to integers")
+
+    # The cast changed the values for a reason not diagnosed above (e.g.
+    # narrowing between integer widths). Return NumPy's result, as the
+    # plain ``np.array(data, dtype=dtype)`` call this helper replaces did,
+    # instead of implicitly returning None to callers that use the result.
+    return casted
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index bf1051332ee197..adccef2c2dd992 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -21,6 +21,7 @@
     ABCPeriodIndex, ABCTimedeltaIndex,
     ABCDateOffset)
 from pandas.core.dtypes.missing import isna, array_equivalent
+from pandas.core.dtypes.cast import maybe_cast_to_integer_array
 from pandas.core.dtypes.common import (
     _ensure_int64,
     _ensure_object,
@@ -309,19 +310,16 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
                     if is_integer_dtype(dtype):
                         inferred = lib.infer_dtype(data)
                         if inferred == 'integer':
-                            try:
-                                data = np.array(data, copy=copy, dtype=dtype)
-                            except OverflowError:
-                                # gh-15823: a more user-friendly error message
-                                raise OverflowError(
-                                    "the elements provided in the data cannot "
-                                    "all be casted to the dtype {dtype}"
-                                    .format(dtype=dtype))
+                            data = maybe_cast_to_integer_array(data, dtype,
+                                                               copy=copy)
                         elif inferred in ['floating', 'mixed-integer-float']:
                             if isna(data).any():
                                 raise ValueError('cannot convert float '
                                                  'NaN to integer')
 
+                            if inferred == "mixed-integer-float":
+                                maybe_cast_to_integer_array(data, dtype)
+
                             # If we are actually all equal to integers,
                             # then coerce to integer.
                             try:
@@ -350,7 +348,8 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
 
                 except (TypeError, ValueError) as e:
                     msg = str(e)
-                    if 'cannot convert float' in msg:
+                    if ("cannot convert float" in msg or
+                            "Trying to coerce float values to integer" in msg):
                         raise
 
             # maybe coerce to a sub-class
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 0450f28087f667..8adce79e486781 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -41,7 +41,8 @@
     maybe_cast_to_datetime, maybe_castable,
     construct_1d_arraylike_from_scalar,
     construct_1d_ndarray_preserving_na,
-    construct_1d_object_array_from_listlike)
+    construct_1d_object_array_from_listlike,
+    maybe_cast_to_integer_array)
 from pandas.core.dtypes.missing import (
     isna,
     notna,
@@ -4067,6 +4068,12 @@ def _try_cast(arr, take_fast_path):
                 return arr
 
         try:
+            # gh-15832: validation only. This raises if the data cannot be
+            # represented in the requested integer dtype; the result is
+            # discarded because ``subarr`` is assigned by the cast below.
+            if is_integer_dtype(dtype):
+                maybe_cast_to_integer_array(np.asarray(arr), dtype)
+
             subarr = maybe_cast_to_datetime(arr, dtype)
             # Take care in creating object arrays (but iterators are not
             # supported):
diff --git a/pandas/tests/generic/test_generic.py b/pandas/tests/generic/test_generic.py
index 311c71f734945d..533bff0384ad91 100644
--- a/pandas/tests/generic/test_generic.py
+++ b/pandas/tests/generic/test_generic.py
@@ -199,11 +199,11 @@ def test_downcast(self):
         self._compare(result, expected)
 
     def test_constructor_compound_dtypes(self):
-        # GH 5191
-        # compound dtypes should raise not-implementederror
+        # see gh-5191
+        # Compound dtypes should raise NotImplementedError.
 
         def f(dtype):
-            return self._construct(shape=3, dtype=dtype)
+            return self._construct(shape=3, value=1, dtype=dtype)
 
         pytest.raises(NotImplementedError, f, [("A", "datetime64[h]"),
                                                ("B", "str"),
@@ -534,14 +534,14 @@ def test_truncate_out_of_bounds(self):
 
         # small
         shape = [int(2e3)] + ([1] * (self._ndim - 1))
-        small = self._construct(shape, dtype='int8')
+        small = self._construct(shape, dtype='int8', value=1)
         self._compare(small.truncate(), small)
         self._compare(small.truncate(before=0, after=3e3), small)
         self._compare(small.truncate(before=-1, after=2e3), small)
 
         # big
         shape = [int(2e6)] + ([1] * (self._ndim - 1))
-        big = self._construct(shape, dtype='int8')
+        big = self._construct(shape, dtype='int8', value=1)
         self._compare(big.truncate(), big)
         self._compare(big.truncate(before=0, after=3e6), big)
         self._compare(big.truncate(before=-1, after=2e6), big)
diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py
index c264f5f79e47e1..29e57d6f671ed4 100644
--- a/pandas/tests/indexes/test_base.py
+++ b/pandas/tests/indexes/test_base.py
@@ -476,7 +476,7 @@ def test_constructor_nonhashable_name(self, indices):
 
     def test_constructor_overflow_int64(self):
         # see gh-15832
-        msg = ("the elements provided in the data cannot "
+        msg = ("The elements provided in the data cannot "
                "all be casted to the dtype int64")
         with tm.assert_raises_regex(OverflowError, msg):
             Index([np.iinfo(np.uint64).max - 1], dtype="int64")
diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py
index 49322d9b7abd69..81d108db83d5b6 100644
--- a/pandas/tests/indexes/test_numeric.py
+++ b/pandas/tests/indexes/test_numeric.py
@@ -451,6 +451,20 @@ def test_astype(self):
             i = Float64Index([0, 1.1, np.NAN])
             pytest.raises(ValueError, lambda: i.astype(dtype))
 
+    @pytest.mark.parametrize("int_dtype", ["uint8", "uint16", "uint32",
+                                           "uint64", "int32", "int64",
+                                           "int16", "int8"])
+    @pytest.mark.parametrize("float_dtype", ["float16", "float32", "float64"])
+    def test_type_coercion(self, int_dtype, float_dtype):
+
+        # see gh-15832
+        msg = "Trying to coerce float values to integers"
+        with tm.assert_raises_regex(ValueError, msg):
+            Index([1, 2, 3.5], dtype=int_dtype)
+
+        i = Index([1, 2, 3.5], dtype=float_dtype)
+        tm.assert_index_equal(i, Index([1, 2, 3.5]))
+
     def test_equals_numeric(self):
 
         i = Float64Index([1.0, 2.0])
@@ -862,6 +876,16 @@ def test_constructor_corner(self):
         with tm.assert_raises_regex(TypeError, 'casting'):
             Int64Index(arr_with_floats)
 
+    @pytest.mark.parametrize("uint_dtype", ["uint8", "uint16",
+                                            "uint32", "uint64"])
+    def test_constructor_coercion_signed_to_unsigned(self, uint_dtype):
+
+        # see gh-15832
+        msg = "Trying to coerce negative values to unsigned integers"
+
+        with tm.assert_raises_regex(OverflowError, msg):
+            Index([-1], dtype=uint_dtype)
+
     def test_coerce_list(self):
         # coerce things
         arr = Index([1, 2, 3, 4])
diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py
index 906d2aacd5586f..424a2f3050956c 100644
--- a/pandas/tests/series/test_constructors.py
+++ b/pandas/tests/series/test_constructors.py
@@ -542,12 +542,35 @@ def test_constructor_pass_nan_nat(self):
         tm.assert_series_equal(Series(np.array([np.nan, pd.NaT])), exp)
 
     def test_constructor_cast(self):
-        pytest.raises(ValueError, Series, ['a', 'b', 'c'], dtype=float)
+        msg = "could not convert string to float"
+        with tm.assert_raises_regex(ValueError, msg):
+            Series(["a", "b", "c"], dtype=float)
+
+    @pytest.mark.parametrize("uint_dtype", ["uint8", "uint16",
+                                            "uint32", "uint64"])
+    def test_constructor_unsigned_dtype_overflow(self, uint_dtype):
+        # see gh-15832
+        msg = 'Trying to coerce negative values to unsigned integers'
+        with tm.assert_raises_regex(OverflowError, msg):
+            Series([-1], dtype=uint_dtype)
+
+    @pytest.mark.parametrize("int_dtype", ["uint8", "uint16", "uint32",
+                                           "uint64", "int32", "int64",
+                                           "int16", "int8"])
+    @pytest.mark.parametrize("float_dtype", ["float16", "float32", "float64"])
+    def test_constructor_coerce_float_fail(self, int_dtype, float_dtype):
+        # see gh-15832
+        msg = "Trying to coerce float values to integers"
+        with tm.assert_raises_regex(ValueError, msg):
+            Series([1, 2, 3.5], dtype=int_dtype)
+
+        s = Series([1, 2, 3.5], dtype=float_dtype)
+        expected = Series([1, 2, 3.5]).astype(float_dtype)
+        assert_series_equal(s, expected)
 
-    def test_constructor_dtype_nocast(self):
-        # 1572
+    def test_constructor_dtype_no_cast(self):
+        # see gh-1572
         s = Series([1, 2, 3])
-
         s2 = Series(s, dtype=np.int64)
 
         s2[1] = 5