From 4efe6560e07f28de6a1834fa90e31cef31b0fb18 Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Fri, 21 Jul 2017 07:05:19 -0400
Subject: [PATCH] BUG/API: dtype inconsistencies in .where / .setitem /
 .putmask / .fillna (#16821)

* CLN/BUG: fix ndarray assignment that may cause an unexpected cast

supersedes #14145
closes #14001

* API: This fixes a number of inconsistencies and API issues
w.r.t. dtype conversions.

This is a reprise of #14145 & #16408.

This removes some code from the core structures & pushes it to internals,
where the primitives are made more consistent.

This should allow us to be a bit more consistent for pandas2-type things.

closes #16402
supersedes #14145
closes #14001

CLN: remove unneeded code in internals; use split_and_operate when possible
---
 doc/source/whatsnew/v0.21.0.txt        |  62 +++
 pandas/_libs/index.pyx                 |  26 +-
 pandas/_libs/tslib.pyx                 |   3 +-
 pandas/core/algorithms.py              |   6 +
 pandas/core/dtypes/cast.py             |  74 ++-
 pandas/core/dtypes/common.py           |  13 +-
 pandas/core/frame.py                   |  70 +--
 pandas/core/generic.py                 |  46 +-
 pandas/core/indexes/base.py            |  12 +-
 pandas/core/indexes/numeric.py         |  21 +-
 pandas/core/internals.py               | 698 +++++++++++++++----------
 pandas/core/ops.py                     |  18 +-
 pandas/core/panel.py                   |  21 +-
 pandas/core/sparse/frame.py            |  11 +-
 pandas/tests/dtypes/test_cast.py       | 109 +++-
 pandas/tests/dtypes/test_common.py     |   2 +
 pandas/tests/dtypes/test_convert.py    |   0
 pandas/tests/dtypes/test_missing.py    |  19 +
 pandas/tests/frame/test_indexing.py    |  30 +-
 pandas/tests/frame/test_operators.py   |  21 +-
 pandas/tests/indexing/test_coercion.py | 221 +++++---
 pandas/tests/indexing/test_datetime.py |  10 +-
 pandas/tests/indexing/test_indexing.py |   6 +
 pandas/tests/series/test_analytics.py  |   8 +-
 pandas/tests/series/test_indexing.py   |  19 +-
 pandas/tests/series/test_missing.py    |  48 +-
 26 files changed, 1022 insertions(+), 552 deletions(-)
 delete mode 100644 pandas/tests/dtypes/test_convert.py

diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
index e9b00d34236e7..91d3e9e7b935b 100644
--- a/doc/source/whatsnew/v0.21.0.txt
+++ b/doc/source/whatsnew/v0.21.0.txt
@@ -127,6 +127,65 @@ the target. Now, a ``ValueError`` will be raised when such an input is passed in
    ...
    ValueError: Cannot operate inplace if there is no assignment
 
+.. _whatsnew_0210.dtype_conversions:
+
+Dtype Conversions
+^^^^^^^^^^^^^^^^^
+
+- Previously, assignments, ``.where()`` and ``.fillna()`` with a ``bool`` assignment would coerce to the
+  same type (e.g. int / float), or raise for datetimelikes. These will now preserve the bools with ``object`` dtype (:issue:`16821`).
+
+  .. ipython:: python
+
+     s = Series([1, 2, 3])
+
+  .. code-block:: python
+
+     In [5]: s[1] = True
+
+     In [6]: s
+     Out[6]:
+     0    1
+     1    1
+     2    3
+     dtype: int64
+
+  New Behavior
+
+  .. ipython:: python
+
+     s[1] = True
+     s
+
+- Previously an assignment to a datetimelike with a non-datetimelike would coerce the
+  non-datetimelike item being assigned (:issue:`14145`).
+
+  .. ipython:: python
+
+     s = pd.Series([pd.Timestamp('2011-01-01'), pd.Timestamp('2012-01-01')])
+
+  .. code-block:: python
+
+     In [1]: s[1] = 1
+
+     In [2]: s
+     Out[2]:
+     0   2011-01-01 00:00:00.000000000
+     1   1970-01-01 00:00:00.000000001
+     dtype: datetime64[ns]
+
+  These now coerce to ``object`` dtype.
+
+  .. ipython:: python
+
+     s[1] = 1
+     s
+
+- Additional bug fixes w.r.t. dtype conversions (sketched below):
+
+  - Inconsistent behavior in ``.where()`` with datetimelikes, which would raise rather than coerce to ``object`` (:issue:`16402`)
+  - Bug in assignment against ``int64`` data with ``np.ndarray`` with ``float64`` dtype which may incorrectly keep the ``int64`` dtype (:issue:`14001`)
+
 .. _whatsnew_0210.api:
 
 Other API Changes
@@ -185,6 +244,9 @@ Bug Fixes
 Conversion
 ^^^^^^^^^^
 
+- Bug in assignment against datetime-like data with ``int`` which may incorrectly convert to datetime-like (:issue:`14145`)
+- Bug in assignment against ``int64`` data with ``np.ndarray`` with ``float64`` dtype which may incorrectly keep the ``int64`` dtype (:issue:`14001`)
+
 Indexing
 ^^^^^^^^
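A minimal sketch of the post-patch behavior the whatsnew entries above
describe, assuming pandas >= 0.21 (the asserts illustrate the intent; they
are not taken from the patch):

    import pandas as pd

    s = pd.Series([1, 2, 3])
    s[1] = True
    # previously True was coerced to int64 (s[1] == 1); now the bool is
    # preserved and the Series upcasts to object dtype
    assert s.dtype == object

    t = pd.Series([pd.Timestamp('2011-01-01'), pd.Timestamp('2012-01-01')])
    t[1] = 1
    # previously 1 was reinterpreted as 1 nanosecond past the epoch; now
    # the Series coerces to object dtype and keeps the integer untouched
    assert t.dtype == object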
diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx
index 5e92c506b5d0c..273dc06886088 100644
--- a/pandas/_libs/index.pyx
+++ b/pandas/_libs/index.pyx
@@ -19,6 +19,7 @@ cimport tslib
 from hashtable cimport *
 from pandas._libs import tslib, algos, hashtable as _hash
 from pandas._libs.tslib import Timestamp, Timedelta
+from datetime import datetime, timedelta
 
 from datetime cimport (get_datetime64_value, _pydatetime_to_dts,
                        pandas_datetimestruct)
@@ -507,24 +508,37 @@ cdef class TimedeltaEngine(DatetimeEngine):
         return 'm8[ns]'
 
 cpdef convert_scalar(ndarray arr, object value):
+    # we don't turn integers
+    # into datetimes/timedeltas
+
+    # we don't turn bools into int/float/complex
+
     if arr.descr.type_num == NPY_DATETIME:
         if isinstance(value, np.ndarray):
             pass
-        elif isinstance(value, Timestamp):
-            return value.value
+        elif isinstance(value, datetime):
+            return Timestamp(value).value
         elif value is None or value != value:
             return iNaT
-        else:
+        elif util.is_string_object(value):
             return Timestamp(value).value
+        raise ValueError("cannot set a Timestamp with a non-timestamp")
+
     elif arr.descr.type_num == NPY_TIMEDELTA:
         if isinstance(value, np.ndarray):
             pass
-        elif isinstance(value, Timedelta):
-            return value.value
+        elif isinstance(value, timedelta):
+            return Timedelta(value).value
         elif value is None or value != value:
             return iNaT
-        else:
+        elif util.is_string_object(value):
             return Timedelta(value).value
+        raise ValueError("cannot set a Timedelta with a non-timedelta")
+
+    if (issubclass(arr.dtype.type, (np.integer, np.floating, np.complex)) and
+            not issubclass(arr.dtype.type, np.bool_)):
+        if util.is_bool_object(value):
+            raise ValueError('Cannot assign bool to float/integer series')
 
     if issubclass(arr.dtype.type, (np.integer, np.bool_)):
         if util.is_float_object(value) and value != value:
diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx
index c471d46262484..44be9ba56b84a 100644
--- a/pandas/_libs/tslib.pyx
+++ b/pandas/_libs/tslib.pyx
@@ -14,6 +14,7 @@ cdef bint PY3 = (sys.version_info[0] >= 3)
 from cpython cimport (
     PyTypeObject,
     PyFloat_Check,
+    PyComplex_Check,
     PyLong_Check,
     PyObject_RichCompareBool,
     PyObject_RichCompare,
@@ -902,7 +903,7 @@ cdef inline bint _checknull_with_nat(object val):
 cdef inline bint _check_all_nulls(object val):
     """ utility to check if a value is any type of null """
     cdef bint res
-    if PyFloat_Check(val):
+    if PyFloat_Check(val) or PyComplex_Check(val):
         res = val != val
     elif val is NaT:
         res = 1
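A minimal sketch of the stricter coercion rules ``convert_scalar`` now
enforces, as seen through Series assignment (assumed post-patch behavior;
the fallback to ``object`` happens in the block internals, not in
``convert_scalar`` itself):

    from datetime import datetime
    import pandas as pd

    s = pd.Series(pd.to_datetime(['2011-01-01', '2012-01-01']))
    s[0] = datetime(2013, 1, 1)    # any datetime still coerces to M8[ns]
    assert s.dtype == 'datetime64[ns]'

    s[0] = 1                       # ints are no longer read as nanoseconds;
    assert s.dtype == object       # the Series falls back to object dtype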
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 3ccd7216fa81a..133e9d7dca18f 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -150,6 +150,12 @@ def _reconstruct_data(values, dtype, original):
         pass
     elif is_datetime64tz_dtype(dtype) or is_period_dtype(dtype):
         values = Index(original)._shallow_copy(values, name=None)
+    elif is_bool_dtype(dtype):
+        values = values.astype(dtype)
+
+        # we only support object dtypes for a bool Index
+        if isinstance(original, Index):
+            values = values.astype(object)
     elif dtype is not None:
         values = values.astype(dtype)
 
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
index 6532e17695c86..22d98a89d68d6 100644
--- a/pandas/core/dtypes/cast.py
+++ b/pandas/core/dtypes/cast.py
@@ -273,7 +273,7 @@ def maybe_promote(dtype, fill_value=np.nan):
     else:
         if issubclass(dtype.type, np.datetime64):
             try:
-                fill_value = lib.Timestamp(fill_value).value
+                fill_value = tslib.Timestamp(fill_value).value
             except:
                 # the proper thing to do here would probably be to upcast
                 # to object (but numpy 1.6.1 doesn't do this properly)
@@ -334,6 +334,23 @@ def maybe_promote(dtype, fill_value=np.nan):
     return dtype, fill_value
 
 
+def infer_dtype_from(val, pandas_dtype=False):
+    """
+    interpret the dtype from a scalar or array. This is a convenience
+    routine to infer the dtype from a scalar or an array.
+
+    Parameters
+    ----------
+    pandas_dtype : bool, default False
+        whether to infer dtype including pandas extension types.
+        If False, a scalar/array belonging to a pandas extension type
+        is inferred as object
+    """
+    if is_scalar(val):
+        return infer_dtype_from_scalar(val, pandas_dtype=pandas_dtype)
+    return infer_dtype_from_array(val, pandas_dtype=pandas_dtype)
+
+
 def infer_dtype_from_scalar(val, pandas_dtype=False):
     """
     interpret the dtype from a scalar
@@ -350,9 +367,9 @@ def infer_dtype_from_scalar(val, pandas_dtype=False):
 
     # a 1-element ndarray
     if isinstance(val, np.ndarray):
+        msg = "invalid ndarray passed to _infer_dtype_from_scalar"
         if val.ndim != 0:
-            raise ValueError(
-                "invalid ndarray passed to _infer_dtype_from_scalar")
+            raise ValueError(msg)
 
         dtype = val.dtype
         val = val.item()
@@ -409,24 +426,31 @@ def infer_dtype_from_scalar(val, pandas_dtype=False):
     return dtype, val
 
 
-def infer_dtype_from_array(arr):
+def infer_dtype_from_array(arr, pandas_dtype=False):
     """
     infer the dtype from a scalar or array
 
     Parameters
     ----------
     arr : scalar or array
+    pandas_dtype : bool, default False
+        whether to infer dtype including pandas extension types.
+        If False, an array belonging to a pandas extension type
+        is inferred as object
 
     Returns
     -------
     tuple (numpy-compat/pandas-compat dtype, array)
 
     Notes
     -----
-    These infer to numpy dtypes exactly
-    with the exception that mixed / object dtypes
+    If pandas_dtype=False, these infer to numpy dtypes
+    exactly, with the exception that mixed / object dtypes
     are not coerced by stringifying or conversion
 
+    If pandas_dtype=True, datetime64tz-aware/categorical
+    types will retain their character.
+ Examples -------- >>> np.asarray([1, '1']) @@ -443,6 +467,12 @@ def infer_dtype_from_array(arr): if not is_list_like(arr): arr = [arr] + if pandas_dtype and is_extension_type(arr): + return arr.dtype, arr + + elif isinstance(arr, ABCSeries): + return arr.dtype, np.asarray(arr) + # don't force numpy coerce with nan's inferred = lib.infer_dtype(arr) if inferred in ['string', 'bytes', 'unicode', @@ -553,7 +583,7 @@ def conv(r, dtype): if isnull(r): pass elif dtype == _NS_DTYPE: - r = lib.Timestamp(r) + r = tslib.Timestamp(r) elif dtype == _TD_DTYPE: r = _coerce_scalar_to_timedelta_type(r) elif dtype == np.bool_: @@ -1027,3 +1057,31 @@ def find_common_type(types): return np.object return np.find_common_type(types, []) + + +def cast_scalar_to_array(shape, value, dtype=None): + """ + create np.ndarray of specified shape and dtype, filled with values + + Parameters + ---------- + shape : tuple + value : scalar value + dtype : np.dtype, optional + dtype to coerce + + Returns + ------- + ndarray of shape, filled with value, of specified / inferred dtype + + """ + + if dtype is None: + dtype, fill_value = infer_dtype_from_scalar(value) + else: + fill_value = value + + values = np.empty(shape, dtype=dtype) + values.fill(fill_value) + + return values diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 114900ce802be..37f99bd344e6c 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -11,7 +11,8 @@ ExtensionDtype) from .generic import (ABCCategorical, ABCPeriodIndex, ABCDatetimeIndex, ABCSeries, - ABCSparseArray, ABCSparseSeries, ABCCategoricalIndex) + ABCSparseArray, ABCSparseSeries, ABCCategoricalIndex, + ABCIndexClass) from .inference import is_string_like from .inference import * # noqa @@ -1545,6 +1546,16 @@ def is_bool_dtype(arr_or_dtype): except ValueError: # this isn't even a dtype return False + + if isinstance(arr_or_dtype, ABCIndexClass): + + # TODO(jreback) + # we don't have a boolean Index class + # so its object, we need to infer to + # guess this + return (arr_or_dtype.is_object and + arr_or_dtype.inferred_type == 'boolean') + return issubclass(tipo, np.bool_) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4843f3389bf75..9514ab8f3b27f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -25,7 +25,8 @@ import numpy.ma as ma from pandas.core.dtypes.cast import ( - maybe_upcast, infer_dtype_from_scalar, + maybe_upcast, + cast_scalar_to_array, maybe_cast_to_datetime, maybe_infer_to_datetimelike, maybe_convert_platform, @@ -59,6 +60,7 @@ is_named_tuple) from pandas.core.dtypes.missing import isnull, notnull + from pandas.core.common import (_try_sort, _default_index, _values_from_object, @@ -385,15 +387,10 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, raise_with_traceback(exc) if arr.ndim == 0 and index is not None and columns is not None: - if isinstance(data, compat.string_types) and dtype is None: - dtype = np.object_ - if dtype is None: - dtype, data = infer_dtype_from_scalar(data) - - values = np.empty((len(index), len(columns)), dtype=dtype) - values.fill(data) - mgr = self._init_ndarray(values, index, columns, dtype=dtype, - copy=False) + values = cast_scalar_to_array((len(index), len(columns)), + data, dtype=dtype) + mgr = self._init_ndarray(values, index, columns, + dtype=values.dtype, copy=False) else: raise ValueError('DataFrame constructor not properly called!') @@ -507,7 +504,7 @@ def _get_axes(N, K, index=index, columns=columns): values = _prep_ndarray(values, copy=copy) if 
dtype is not None: - if values.dtype != dtype: + if not is_dtype_equal(values.dtype, dtype): try: values = values.astype(dtype) except Exception as orig: @@ -2689,9 +2686,8 @@ def reindexer(value): else: # upcast the scalar - dtype, value = infer_dtype_from_scalar(value) - value = np.repeat(value, len(self.index)).astype(dtype) - value = maybe_cast_to_datetime(value, dtype) + value = cast_scalar_to_array(len(self.index), value) + value = maybe_cast_to_datetime(value, value.dtype) # return internal types directly if is_extension_type(value): @@ -3676,7 +3672,8 @@ def reorder_levels(self, order, axis=0): # ---------------------------------------------------------------------- # Arithmetic / combination related - def _combine_frame(self, other, func, fill_value=None, level=None): + def _combine_frame(self, other, func, fill_value=None, level=None, + try_cast=True): this, other = self.align(other, join='outer', level=level, copy=False) new_index, new_columns = this.index, this.columns @@ -3729,19 +3726,23 @@ def f(i): copy=False) def _combine_series(self, other, func, fill_value=None, axis=None, - level=None): + level=None, try_cast=True): if axis is not None: axis = self._get_axis_name(axis) if axis == 'index': return self._combine_match_index(other, func, level=level, - fill_value=fill_value) + fill_value=fill_value, + try_cast=try_cast) else: return self._combine_match_columns(other, func, level=level, - fill_value=fill_value) + fill_value=fill_value, + try_cast=try_cast) return self._combine_series_infer(other, func, level=level, - fill_value=fill_value) + fill_value=fill_value, + try_cast=try_cast) - def _combine_series_infer(self, other, func, level=None, fill_value=None): + def _combine_series_infer(self, other, func, level=None, + fill_value=None, try_cast=True): if len(other) == 0: return self * NA @@ -3751,9 +3752,11 @@ def _combine_series_infer(self, other, func, level=None, fill_value=None): columns=self.columns) return self._combine_match_columns(other, func, level=level, - fill_value=fill_value) + fill_value=fill_value, + try_cast=try_cast) - def _combine_match_index(self, other, func, level=None, fill_value=None): + def _combine_match_index(self, other, func, level=None, + fill_value=None, try_cast=True): left, right = self.align(other, join='outer', axis=0, level=level, copy=False) if fill_value is not None: @@ -3763,7 +3766,8 @@ def _combine_match_index(self, other, func, level=None, fill_value=None): index=left.index, columns=self.columns, copy=False) - def _combine_match_columns(self, other, func, level=None, fill_value=None): + def _combine_match_columns(self, other, func, level=None, + fill_value=None, try_cast=True): left, right = self.align(other, join='outer', axis=1, level=level, copy=False) if fill_value is not None: @@ -3771,15 +3775,17 @@ def _combine_match_columns(self, other, func, level=None, fill_value=None): fill_value) new_data = left._data.eval(func=func, other=right, - axes=[left.columns, self.index]) + axes=[left.columns, self.index], + try_cast=try_cast) return self._constructor(new_data) - def _combine_const(self, other, func, raise_on_error=True): + def _combine_const(self, other, func, raise_on_error=True, try_cast=True): new_data = self._data.eval(func=func, other=other, - raise_on_error=raise_on_error) + raise_on_error=raise_on_error, + try_cast=try_cast) return self._constructor(new_data) - def _compare_frame_evaluate(self, other, func, str_rep): + def _compare_frame_evaluate(self, other, func, str_rep, try_cast=True): # unique if 
self.columns.is_unique: @@ -3803,16 +3809,18 @@ def _compare(a, b): result.columns = self.columns return result - def _compare_frame(self, other, func, str_rep): + def _compare_frame(self, other, func, str_rep, try_cast=True): if not self._indexed_same(other): raise ValueError('Can only compare identically-labeled ' 'DataFrame objects') - return self._compare_frame_evaluate(other, func, str_rep) + return self._compare_frame_evaluate(other, func, str_rep, + try_cast=try_cast) - def _flex_compare_frame(self, other, func, str_rep, level): + def _flex_compare_frame(self, other, func, str_rep, level, try_cast=True): if not self._indexed_same(other): self, other = self.align(other, 'outer', level=level, copy=False) - return self._compare_frame_evaluate(other, func, str_rep) + return self._compare_frame_evaluate(other, func, str_rep, + try_cast=try_cast) def combine(self, other, func, fill_value=None, overwrite=True): """ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index b2083a4454f84..68416d85ca659 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -13,7 +13,6 @@ from pandas.core.dtypes.common import ( _ensure_int64, _ensure_object, - needs_i8_conversion, is_scalar, is_number, is_integer, is_bool, @@ -26,7 +25,8 @@ is_dict_like, is_re_compilable, pandas_dtype) -from pandas.core.dtypes.cast import maybe_promote, maybe_upcast_putmask +from pandas.core.dtypes.cast import ( + maybe_promote, maybe_upcast_putmask) from pandas.core.dtypes.missing import isnull, notnull from pandas.core.dtypes.generic import ABCSeries, ABCPanel @@ -5465,48 +5465,6 @@ def _where(self, cond, other=np.nan, inplace=False, axis=None, level=None, raise NotImplementedError("cannot align with a higher " "dimensional NDFrame") - elif is_list_like(other): - - if self.ndim == 1: - - # try to set the same dtype as ourselves - try: - new_other = np.array(other, dtype=self.dtype) - except ValueError: - new_other = np.array(other) - except TypeError: - new_other = other - - # we can end up comparing integers and m8[ns] - # which is a numpy no no - is_i8 = needs_i8_conversion(self.dtype) - if is_i8: - matches = False - else: - matches = (new_other == np.array(other)) - - if matches is False or not matches.all(): - - # coerce other to a common dtype if we can - if needs_i8_conversion(self.dtype): - try: - other = np.array(other, dtype=self.dtype) - except: - other = np.array(other) - else: - other = np.asarray(other) - other = np.asarray(other, - dtype=np.common_type(other, - new_other)) - - # we need to use the new dtype - try_quick = False - else: - other = new_other - else: - - other = np.array(other) - if isinstance(other, np.ndarray): if other.shape != self.shape: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index c95a9598604ee..714b952217c9d 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -26,6 +26,7 @@ is_object_dtype, is_categorical_dtype, is_interval_dtype, + is_bool, is_bool_dtype, is_signed_integer_dtype, is_unsigned_integer_dtype, @@ -611,9 +612,18 @@ def repeat(self, repeats, *args, **kwargs): def where(self, cond, other=None): if other is None: other = self._na_value - values = np.where(cond, self.values, other) dtype = self.dtype + values = self.values + + if is_bool(other) or is_bool_dtype(other): + + # bools force casting + values = values.astype(object) + dtype = None + + values = np.where(cond, values, other) + if self._is_numeric_dtype and np.any(isnull(values)): # We can't coerce to the numeric dtype of "self" (unless # it's 
float) if there are NaN values in our output. diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index 72d521cbe2d60..142e0f36c66ec 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -2,9 +2,14 @@ from pandas._libs import (index as libindex, algos as libalgos, join as libjoin) from pandas.core.dtypes.common import ( - is_dtype_equal, pandas_dtype, - is_float_dtype, is_object_dtype, - is_integer_dtype, is_scalar) + is_dtype_equal, + pandas_dtype, + is_float_dtype, + is_object_dtype, + is_integer_dtype, + is_bool, + is_bool_dtype, + is_scalar) from pandas.core.common import _asarray_tuplesafe, _values_from_object from pandas import compat @@ -56,6 +61,16 @@ def _maybe_cast_slice_bound(self, label, side, kind): # we will try to coerce to integers return self._maybe_cast_indexer(label) + def _convert_for_op(self, value): + """ Convert value to be insertable to ndarray """ + + if is_bool(value) or is_bool_dtype(value): + # force conversion to object + # so we don't lose the bools + raise TypeError + + return value + def _convert_tolerance(self, tolerance): try: return float(tolerance) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index f2a7ac76481d4..8f3667edf68e6 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1,4 +1,5 @@ import copy +from warnings import catch_warnings import itertools import re import operator @@ -22,6 +23,7 @@ is_categorical, is_categorical_dtype, is_integer_dtype, is_datetime64tz_dtype, + is_bool_dtype, is_object_dtype, is_datetimelike_v_numeric, is_float_dtype, is_numeric_dtype, @@ -33,21 +35,21 @@ _get_dtype) from pandas.core.dtypes.cast import ( maybe_downcast_to_dtype, - maybe_convert_string_to_object, maybe_upcast, - maybe_convert_scalar, maybe_promote, + maybe_promote, + infer_dtype_from, infer_dtype_from_scalar, soft_convert_objects, maybe_convert_objects, astype_nansafe, find_common_type) from pandas.core.dtypes.missing import ( - isnull, array_equivalent, + isnull, notnull, array_equivalent, _is_na_compat, is_null_datelike_scalar) import pandas.core.dtypes.concat as _concat -from pandas.core.dtypes.generic import ABCSeries +from pandas.core.dtypes.generic import ABCSeries, ABCDatetimeIndex from pandas.core.common import is_null_slice import pandas.core.algorithms as algos @@ -169,11 +171,6 @@ def get_values(self, dtype=None): def to_dense(self): return self.values.view() - def to_object_block(self, mgr): - """ return myself as an object block """ - values = self.get_values(dtype=object) - return self.make_block(values, klass=ObjectBlock) - @property def _na_value(self): return np.nan @@ -374,7 +371,6 @@ def fillna(self, value, limit=None, inplace=False, downcast=None, else: return self.copy() - original_value = value mask = isnull(self.values) if limit is not None: if not is_integer(limit): @@ -388,7 +384,7 @@ def fillna(self, value, limit=None, inplace=False, downcast=None, # fillna, but if we cannot coerce, then try again as an ObjectBlock try: - values, _, value, _ = self._try_coerce_args(self.values, value) + values, _, _, _ = self._try_coerce_args(self.values, value) blocks = self.putmask(mask, value, inplace=inplace) blocks = [b.make_block(values=self._try_coerce_result(b.values)) for b in blocks] @@ -399,12 +395,82 @@ def fillna(self, value, limit=None, inplace=False, downcast=None, if not mask.any(): return self if inplace else self.copy() - # we cannot coerce the underlying object, so - # make an ObjectBlock - return 
self.to_object_block(mgr=mgr).fillna(original_value, - limit=limit, - inplace=inplace, - downcast=False) + # operate column-by-column + def f(m, v, i): + block = self.coerce_to_target_dtype(value) + + # slice out our block + if i is not None: + block = block.getitem_block(slice(i, i + 1)) + return block.fillna(value, + limit=limit, + inplace=inplace, + downcast=None) + + return self.split_and_operate(mask, f, inplace) + + def split_and_operate(self, mask, f, inplace): + """ + split the block per-column, and apply the callable f + per-column, return a new block for each. Handle + masking which will not change a block unless needed. + + Parameters + ---------- + mask : 2-d boolean mask + f : callable accepting (1d-mask, 1d values, indexer) + inplace : boolean + + Returns + ------- + list of blocks + """ + + if mask is None: + mask = np.ones(self.shape, dtype=bool) + new_values = self.values + + def make_a_block(nv, ref_loc): + if isinstance(nv, Block): + block = nv + elif isinstance(nv, list): + block = nv[0] + else: + # Put back the dimension that was taken from it and make + # a block out of the result. + try: + nv = _block_shape(nv, ndim=self.ndim) + except (AttributeError, NotImplementedError): + pass + block = self.make_block(values=nv, + placement=ref_loc, fastpath=True) + return block + + # ndim == 1 + if self.ndim == 1: + if mask.any(): + nv = f(mask, new_values, None) + else: + nv = new_values if inplace else new_values.copy() + block = make_a_block(nv, self.mgr_locs) + return [block] + + # ndim > 1 + new_blocks = [] + for i, ref_loc in enumerate(self.mgr_locs): + m = mask[i] + v = new_values[i] + + # need a new block + if m.any(): + nv = f(m, v, i) + else: + nv = v if inplace else v.copy() + + block = make_a_block(nv, [ref_loc]) + new_blocks.append(block) + + return new_blocks def _maybe_downcast(self, blocks, downcast=None): @@ -415,6 +481,8 @@ def _maybe_downcast(self, blocks, downcast=None): elif downcast is None and (self.is_timedelta or self.is_datetime): return blocks + if not isinstance(blocks, list): + blocks = [blocks] return _extend_blocks([b.downcast(downcast) for b in blocks]) def downcast(self, dtypes=None, mgr=None): @@ -444,27 +512,20 @@ def downcast(self, dtypes=None, mgr=None): raise ValueError("downcast must have a dictionary or 'infer' as " "its argument") - # item-by-item + # operate column-by-column # this is expensive as it splits the blocks items-by-item - blocks = [] - for i, rl in enumerate(self.mgr_locs): + def f(m, v, i): if dtypes == 'infer': dtype = 'infer' else: raise AssertionError("dtypes as dict is not supported yet") - # TODO: This either should be completed or removed - dtype = dtypes.get(item, self._downcast_dtype) # noqa - if dtype is None: - nv = _block_shape(values[i], ndim=self.ndim) - else: - nv = maybe_downcast_to_dtype(values[i], dtype) - nv = _block_shape(nv, ndim=self.ndim) + if dtype is not None: + v = maybe_downcast_to_dtype(v, dtype) + return v - blocks.append(self.make_block(nv, fastpath=True, placement=[rl])) - - return blocks + return self.split_and_operate(None, f, False) def astype(self, dtype, copy=False, errors='raise', values=None, **kwargs): return self._astype(dtype, copy=copy, errors=errors, values=values, @@ -545,11 +606,14 @@ def convert(self, copy=True, **kwargs): return self.copy() if copy else self - def _can_hold_element(self, value): - raise NotImplementedError() - - def _try_cast(self, value): - raise NotImplementedError() + def _can_hold_element(self, element): + """ require the same dtype as ourselves """ + dtype 
= self.values.dtype.type + if is_list_like(element): + element = np.asarray(element) + tipo = element.dtype.type + return issubclass(tipo, dtype) + return isinstance(element, dtype) def _try_cast_result(self, result, dtype=None): """ try to cast the result to our original type, we may have @@ -584,12 +648,16 @@ def _try_cast_result(self, result, dtype=None): # may need to change the dtype here return maybe_downcast_to_dtype(result, dtype) - def _try_operate(self, values): - """ return a version to operate on as the input """ - return values - def _try_coerce_args(self, values, other): """ provide coercion to our input arguments """ + + if np.any(notnull(other)) and not self._can_hold_element(other): + # coercion issues + # let higher levels handle + raise TypeError("cannot convert {} to an {}".format( + type(other).__name__, + type(self).__name__.lower().replace('Block', ''))) + return values, False, other, False def _try_coerce_result(self, result): @@ -601,9 +669,6 @@ def _try_coerce_and_cast_result(self, result, dtype=None): result = self._try_cast_result(result, dtype=dtype) return result - def _try_fill(self, value): - return value - def to_native_types(self, slicer=None, na_rep='nan', quoting=None, **kwargs): """ convert to our native types format, slicing if desired """ @@ -639,7 +704,7 @@ def replace(self, to_replace, value, inplace=False, filter=None, inplace = validate_bool_kwarg(inplace, 'inplace') original_to_replace = to_replace - mask = isnull(self.values) + # try to replace, if we raise an error, convert to ObjectBlock and # retry try: @@ -657,11 +722,9 @@ def replace(self, to_replace, value, inplace=False, filter=None, return blocks except (TypeError, ValueError): - # we can't process the value, but nothing to do - if not mask.any(): - return self if inplace else self.copy() - - return self.to_object_block(mgr=mgr).replace( + # try again with a compatible block + block = self.astype(object) + return block.replace( to_replace=original_to_replace, value=value, inplace=inplace, filter=filter, regex=regex, convert=convert) @@ -676,14 +739,48 @@ def setitem(self, indexer, value, mgr=None): indexer is a direct slice/positional indexer; value must be a compatible shape """ - # coerce None values, if appropriate if value is None: if self.is_numeric: value = np.nan - # coerce args - values, _, value, _ = self._try_coerce_args(self.values, value) + # coerce if block dtype can store value + values = self.values + try: + values, _, value, _ = self._try_coerce_args(values, value) + # can keep its own dtype + if hasattr(value, 'dtype') and is_dtype_equal(values.dtype, + value.dtype): + dtype = self.dtype + else: + dtype = 'infer' + + except (TypeError, ValueError): + # current dtype cannot store value, coerce to common dtype + find_dtype = False + + if hasattr(value, 'dtype'): + dtype = value.dtype + find_dtype = True + + elif is_scalar(value): + if isnull(value): + # NaN promotion is handled in latter path + dtype = False + else: + dtype, _ = infer_dtype_from_scalar(value, + pandas_dtype=True) + find_dtype = True + else: + dtype = 'infer' + + if find_dtype: + dtype = find_common_type([values.dtype, dtype]) + if not is_dtype_equal(self.dtype, dtype): + b = self.astype(dtype) + return b.setitem(indexer, value, mgr=mgr) + + # value must be storeable at this moment arr_value = np.array(value) # cast the values to a type that can hold nan (if necessary) @@ -713,87 +810,58 @@ def setitem(self, indexer, value, mgr=None): raise ValueError("cannot set using a slice indexer with a " "different 
length than the value") - try: - - def _is_scalar_indexer(indexer): - # return True if we are all scalar indexers + def _is_scalar_indexer(indexer): + # return True if we are all scalar indexers - if arr_value.ndim == 1: - if not isinstance(indexer, tuple): - indexer = tuple([indexer]) - return all([is_scalar(idx) for idx in indexer]) - return False - - def _is_empty_indexer(indexer): - # return a boolean if we have an empty indexer - - if arr_value.ndim == 1: - if not isinstance(indexer, tuple): - indexer = tuple([indexer]) + if arr_value.ndim == 1: + if not isinstance(indexer, tuple): + indexer = tuple([indexer]) return any(isinstance(idx, np.ndarray) and len(idx) == 0 for idx in indexer) - return False - - # empty indexers - # 8669 (empty) - if _is_empty_indexer(indexer): - pass - - # setting a single element for each dim and with a rhs that could - # be say a list - # GH 6043 - elif _is_scalar_indexer(indexer): - values[indexer] = value - - # if we are an exact match (ex-broadcasting), - # then use the resultant dtype - elif (len(arr_value.shape) and - arr_value.shape[0] == values.shape[0] and - np.prod(arr_value.shape) == np.prod(values.shape)): - values[indexer] = value - values = values.astype(arr_value.dtype) - - # set - else: - values[indexer] = value + return False - # coerce and try to infer the dtypes of the result - if hasattr(value, 'dtype') and is_dtype_equal(values.dtype, - value.dtype): - dtype = value.dtype - elif is_scalar(value): - dtype, _ = infer_dtype_from_scalar(value) - else: - dtype = 'infer' - values = self._try_coerce_and_cast_result(values, dtype) - block = self.make_block(transf(values), fastpath=True) + def _is_empty_indexer(indexer): + # return a boolean if we have an empty indexer - # may have to soft convert_objects here - if block.is_object and not self.is_object: - block = block.convert(numeric=False) + if is_list_like(indexer) and not len(indexer): + return True + if arr_value.ndim == 1: + if not isinstance(indexer, tuple): + indexer = tuple([indexer]) + return any(isinstance(idx, np.ndarray) and len(idx) == 0 + for idx in indexer) + return False - return block - except ValueError: - raise - except TypeError: + # empty indexers + # 8669 (empty) + if _is_empty_indexer(indexer): + pass - # cast to the passed dtype if possible - # otherwise raise the original error + # setting a single element for each dim and with a rhs that could + # be say a list + # GH 6043 + elif _is_scalar_indexer(indexer): + values[indexer] = value + + # if we are an exact match (ex-broadcasting), + # then use the resultant dtype + elif (len(arr_value.shape) and + arr_value.shape[0] == values.shape[0] and + np.prod(arr_value.shape) == np.prod(values.shape)): + values[indexer] = value try: - # e.g. 
we are uint32 and our value is uint64 - # this is for compat with older numpies - block = self.make_block(transf(values.astype(value.dtype))) - return block.setitem(indexer=indexer, value=value, mgr=mgr) - - except: + values = values.astype(arr_value.dtype) + except ValueError: pass - raise - - except Exception: - pass + # set + else: + values[indexer] = value - return [self] + # coerce and try to infer the dtypes of the result + values = self._try_coerce_and_cast_result(values, dtype) + block = self.make_block(transf(values), fastpath=True) + return block def putmask(self, mask, new, align=True, inplace=False, axis=0, transpose=False, mgr=None): @@ -830,11 +898,11 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0, new = self.fill_value if self._can_hold_element(new): + _, _, new, _ = self._try_coerce_args(new_values, new) + if transpose: new_values = new_values.T - new = self._try_cast(new) - # If the default repeat behavior in np.putmask would go in the # wrong direction, then explictly repeat and reshape new instead if getattr(new, 'ndim', 0) >= 1: @@ -843,6 +911,23 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0, new, new_values.shape[-1]).reshape(self.shape) new = new.astype(new_values.dtype) + # we require exact matches between the len of the + # values we are setting (or is compat). np.putmask + # doesn't check this and will simply truncate / pad + # the output, but we want sane error messages + # + # TODO: this prob needs some better checking + # for 2D cases + if ((is_list_like(new) and + np.any(mask[mask]) and + getattr(new, 'ndim', 1) == 1)): + + if not (mask.shape[-1] == len(new) or + mask[mask].shape[-1] == len(new) or + len(new) == 1): + raise ValueError("cannot assign mismatch " + "length to masked array") + np.putmask(new_values, mask, new) # maybe upcast me @@ -860,41 +945,29 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0, new_shape.insert(axis, 1) new = new.reshape(tuple(new_shape)) - # need to go column by column - new_blocks = [] - if self.ndim > 1: - for i, ref_loc in enumerate(self.mgr_locs): - m = mask[i] - v = new_values[i] - - # need a new block - if m.any(): - if isinstance(new, np.ndarray): - n = np.squeeze(new[i % new.shape[0]]) - else: - n = np.array(new) - - # type of the new block - dtype, _ = maybe_promote(n.dtype) + # operate column-by-column + def f(m, v, i): - # we need to explicitly astype here to make a copy - n = n.astype(dtype) + if i is None: + # ndim==1 case. + n = new + else: - nv = _putmask_smart(v, m, n) + if isinstance(new, np.ndarray): + n = np.squeeze(new[i % new.shape[0]]) else: - nv = v if inplace else v.copy() + n = np.array(new) - # Put back the dimension that was taken from it and make - # a block out of the result. 
- block = self.make_block(values=nv[np.newaxis], - placement=[ref_loc], fastpath=True) + # type of the new block + dtype, _ = maybe_promote(n.dtype) - new_blocks.append(block) + # we need to explicitly astype here to make a copy + n = n.astype(dtype) - else: - nv = _putmask_smart(new_values, mask, new) - new_blocks.append(self.make_block(values=nv, fastpath=True)) + nv = _putmask_smart(v, m, n) + return nv + new_blocks = self.split_and_operate(mask, f, inplace) return new_blocks if inplace: @@ -905,6 +978,67 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0, return [self.make_block(new_values, fastpath=True)] + def coerce_to_target_dtype(self, other): + """ + coerce the current block to a dtype compat for other + we will return a block, possibly object, and not raise + + we can also safely try to coerce to the same dtype + and will receive the same block + """ + + # if we cannot then coerce to object + dtype, _ = infer_dtype_from(other, pandas_dtype=True) + + if is_dtype_equal(self.dtype, dtype): + return self + + if self.is_bool or is_object_dtype(dtype) or is_bool_dtype(dtype): + # we don't upcast to bool + return self.astype(object) + + elif ((self.is_float or self.is_complex) and + (is_integer_dtype(dtype) or is_float_dtype(dtype))): + # don't coerce float/complex to int + return self + + elif (self.is_datetime or + is_datetime64_dtype(dtype) or + is_datetime64tz_dtype(dtype)): + + # not a datetime + if not ((is_datetime64_dtype(dtype) or + is_datetime64tz_dtype(dtype)) and self.is_datetime): + return self.astype(object) + + # don't upcast timezone with different timezone or no timezone + mytz = getattr(self.dtype, 'tz', None) + othertz = getattr(dtype, 'tz', None) + + if str(mytz) != str(othertz): + return self.astype(object) + + raise AssertionError("possible recursion in " + "coerce_to_target_dtype: {} {}".format( + self, other)) + + elif (self.is_timedelta or is_timedelta64_dtype(dtype)): + + # not a timedelta + if not (is_timedelta64_dtype(dtype) and self.is_timedelta): + return self.astype(object) + + raise AssertionError("possible recursion in " + "coerce_to_target_dtype: {} {}".format( + self, other)) + + try: + return self.astype(dtype) + except (ValueError, TypeError): + pass + + return self.astype(object) + def interpolate(self, method='pad', axis=0, index=None, values=None, inplace=False, limit=None, limit_direction='forward', fill_value=None, coerce=False, downcast=None, mgr=None, @@ -972,7 +1106,6 @@ def _interpolate_with_fill(self, method='pad', axis=0, inplace=False, values = self.values if inplace else self.values.copy() values, _, fill_value, _ = self._try_coerce_args(values, fill_value) - values = self._try_operate(values) values = missing.interpolate_2d(values, method=method, axis=axis, limit=limit, fill_value=fill_value, dtype=self.dtype) @@ -1111,6 +1244,7 @@ def eval(self, func, other, raise_on_error=True, try_cast=False, mgr=None): ------- a new block, the result of the func """ + orig_other = other values = self.values if hasattr(other, 'reindex_axis'): @@ -1135,8 +1269,14 @@ def eval(self, func, other, raise_on_error=True, try_cast=False, mgr=None): transf = (lambda x: x.T) if is_transposed else (lambda x: x) # coerce/transpose the args if needed - values, values_mask, other, other_mask = self._try_coerce_args( - transf(values), other) + try: + values, values_mask, other, other_mask = self._try_coerce_args( + transf(values), other) + except TypeError: + block = self.coerce_to_target_dtype(orig_other) + return block.eval(func, orig_other, + 
raise_on_error=raise_on_error, + try_cast=try_cast, mgr=mgr) # get the result, may need to transpose the other def get_result(other): @@ -1163,7 +1303,7 @@ def get_result(other): result = result.astype('float64', copy=False) result[other_mask.ravel()] = np.nan - return self._try_coerce_result(result) + return result # error handler if we have an issue operating with the function def handle_error(): @@ -1211,6 +1351,7 @@ def handle_error(): if try_cast: result = self._try_cast_result(result) + result = _block_shape(result, ndim=self.ndim) return [self.make_block(result, fastpath=True, )] def where(self, other, cond, align=True, raise_on_error=True, @@ -1233,8 +1374,8 @@ def where(self, other, cond, align=True, raise_on_error=True, ------- a new block(s), the result of the func """ - values = self.values + orig_other = other if transpose: values = values.T @@ -1254,9 +1395,6 @@ def where(self, other, cond, align=True, raise_on_error=True, raise ValueError("where must have a condition that is ndarray " "like") - other = maybe_convert_string_to_object(other) - other = maybe_convert_scalar(other) - # our where function def func(cond, values, other): if cond.ravel().all(): @@ -1264,6 +1402,7 @@ def func(cond, values, other): values, values_mask, other, other_mask = self._try_coerce_args( values, other) + try: return self._try_coerce_result(expressions.where( cond, values, other, raise_on_error=True)) @@ -1279,7 +1418,19 @@ def func(cond, values, other): # see if we can operate on the entire block, or need item-by-item # or if we are a single block (ndim == 1) - result = func(cond, values, other) + try: + result = func(cond, values, other) + except TypeError: + + # we cannot coerce, return a compat dtype + # we are explicity ignoring raise_on_error here + block = self.coerce_to_target_dtype(other) + blocks = block.where(orig_other, cond, align=align, + raise_on_error=raise_on_error, + try_cast=try_cast, axis=axis, + transpose=transpose) + return self._maybe_downcast(blocks, 'infer') + if self._can_hold_na or self.ndim == 1: if transpose: @@ -1543,6 +1694,7 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0, new = new[mask] mask = _safe_reshape(mask, new_values.shape) + new_values[mask] = new new_values = self._try_coerce_result(new_values) return [self.make_block(values=new_values)] @@ -1578,20 +1730,14 @@ class FloatBlock(FloatOrComplexBlock): def _can_hold_element(self, element): if is_list_like(element): - element = np.array(element) + element = np.asarray(element) tipo = element.dtype.type return (issubclass(tipo, (np.floating, np.integer)) and not issubclass(tipo, (np.datetime64, np.timedelta64))) - return (isinstance(element, (float, int, np.float_, np.int_)) and + return (isinstance(element, (float, int, np.floating, np.int_)) and not isinstance(element, (bool, np.bool_, datetime, timedelta, np.datetime64, np.timedelta64))) - def _try_cast(self, element): - try: - return float(element) - except: # pragma: no cover - return element - def to_native_types(self, slicer=None, na_rep='', float_format=None, decimal='.', quoting=None, **kwargs): """ convert to our native types format, slicing if desired """ @@ -1639,13 +1785,7 @@ def _can_hold_element(self, element): (np.floating, np.integer, np.complexfloating)) return (isinstance(element, (float, int, complex, np.float_, np.int_)) and - not isinstance(bool, np.bool_)) - - def _try_cast(self, element): - try: - return complex(element) - except: # pragma: no cover - return element + not isinstance(element, (bool, np.bool_))) def 
should_store(self, value): return issubclass(value.dtype.type, np.complexfloating) @@ -1661,15 +1801,10 @@ def _can_hold_element(self, element): element = np.array(element) tipo = element.dtype.type return (issubclass(tipo, np.integer) and - not issubclass(tipo, (np.datetime64, np.timedelta64))) + not issubclass(tipo, (np.datetime64, np.timedelta64)) and + self.dtype.itemsize >= element.dtype.itemsize) return is_integer(element) - def _try_cast(self, element): - try: - return int(element) - except: # pragma: no cover - return element - def should_store(self, value): return is_integer_dtype(value) and value.dtype == self.dtype @@ -1684,10 +1819,6 @@ def _na_value(self): def fill_value(self): return tslib.iNaT - def _try_operate(self, values): - """ return a version to operate on """ - return values.view('i8') - def get_values(self, dtype=None): """ return object dtype as boxed values, such as Timestamps/Timedelta @@ -1708,11 +1839,18 @@ class TimeDeltaBlock(DatetimeLikeBlockMixin, IntBlock): def _box_func(self): return lambda x: tslib.Timedelta(x, unit='ns') + def _can_hold_element(self, element): + if is_list_like(element): + element = np.array(element) + tipo = element.dtype.type + return issubclass(tipo, np.timedelta64) + return isinstance(element, (timedelta, np.timedelta64)) + def fillna(self, value, **kwargs): # allow filling with integers to be # interpreted as seconds - if not isinstance(value, np.timedelta64) and is_integer(value): + if is_integer(value) and not isinstance(value, np.timedelta64): value = Timedelta(value, unit='s') return super(TimeDeltaBlock, self).fillna(value, **kwargs) @@ -1743,19 +1881,18 @@ def _try_coerce_args(self, values, other): elif isinstance(other, Timedelta): other_mask = isnull(other) other = other.value + elif isinstance(other, timedelta): + other = Timedelta(other).value elif isinstance(other, np.timedelta64): other_mask = isnull(other) other = Timedelta(other).value - elif isinstance(other, timedelta): - other = Timedelta(other).value - elif isinstance(other, np.ndarray): + elif hasattr(other, 'dtype') and is_timedelta64_dtype(other): other_mask = isnull(other) other = other.astype('i8', copy=False).view('i8') else: - # scalar - other = Timedelta(other) - other_mask = isnull(other) - other = other.value + # coercion issues + # let higher levels handle + raise TypeError return values, values_mask, other, other_mask @@ -1805,15 +1942,9 @@ class BoolBlock(NumericBlock): def _can_hold_element(self, element): if is_list_like(element): - element = np.array(element) - return issubclass(element.dtype.type, np.integer) - return isinstance(element, (int, bool)) - - def _try_cast(self, element): - try: - return bool(element) - except: # pragma: no cover - return element + element = np.asarray(element) + return issubclass(element.dtype.type, np.bool_) + return isinstance(element, (bool, np.bool_)) def should_store(self, value): return issubclass(value.dtype.type, np.bool_) @@ -1881,31 +2012,24 @@ def convert(self, *args, **kwargs): if key in kwargs: fn_kwargs[key] = kwargs[key] - # attempt to create new type blocks - blocks = [] - if by_item and not self._is_single_block: - - for i, rl in enumerate(self.mgr_locs): - values = self.iget(i) + # operate column-by-column + def f(m, v, i): + shape = v.shape + values = fn(v.ravel(), **fn_kwargs) + try: + values = values.reshape(shape) + values = _block_shape(values, ndim=self.ndim) + except (AttributeError, NotImplementedError): + pass - shape = values.shape - values = fn(values.ravel(), **fn_kwargs) - try: - values 
= values.reshape(shape) - values = _block_shape(values, ndim=self.ndim) - except (AttributeError, NotImplementedError): - pass - newb = make_block(values, ndim=self.ndim, placement=[rl]) - blocks.append(newb) + return values + if by_item and not self._is_single_block: + blocks = self.split_and_operate(None, f, False) else: - values = fn(self.values.ravel(), **fn_kwargs) - try: - values = values.reshape(self.values.shape) - except NotImplementedError: - pass - blocks.append(make_block(values, ndim=self.ndim, - placement=self.mgr_locs)) + values = f(None, self.values.ravel(), None) + blocks = [make_block(values, ndim=self.ndim, + placement=self.mgr_locs)] return blocks @@ -1949,8 +2073,14 @@ def _maybe_downcast(self, blocks, downcast=None): def _can_hold_element(self, element): return True - def _try_cast(self, element): - return element + def _try_coerce_args(self, values, other): + """ provide coercion to our input arguments """ + + if isinstance(other, ABCDatetimeIndex): + # to store DatetimeTZBlock as object + other = other.asobject.values + + return values, False, other, False def should_store(self, value): return not (issubclass(value.dtype.type, @@ -2249,12 +2379,6 @@ def _can_hold_element(self, element): return (is_integer(element) or isinstance(element, datetime) or isnull(element)) - def _try_cast(self, element): - try: - return int(element) - except: - return element - def _try_coerce_args(self, values, other): """ Coerce values and other to dtype 'i8'. NaN and NaT convert to @@ -2288,19 +2412,13 @@ def _try_coerce_args(self, values, other): "naive Block") other_mask = isnull(other) other = other.asm8.view('i8') - elif hasattr(other, 'dtype') and is_integer_dtype(other): - other = other.view('i8') + elif hasattr(other, 'dtype') and is_datetime64_dtype(other): + other_mask = isnull(other) + other = other.astype('i8', copy=False).view('i8') else: - try: - other = np.asarray(other) - other_mask = isnull(other) - - other = other.astype('i8', copy=False).view('i8') - except ValueError: - - # coercion issues - # let higher levels handle - raise TypeError + # coercion issues + # let higher levels handle + raise TypeError return values, values_mask, other, other_mask @@ -2400,21 +2518,6 @@ def get_values(self, dtype=None): self.values.ravel(), f).reshape(self.values.shape) return self.values - def to_object_block(self, mgr): - """ - return myself as an object block - - Since we keep the DTI as a 1-d object, this is different - depends on BlockManager's ndim - """ - values = self.get_values(dtype=object) - kwargs = {} - if mgr.ndim > 1: - values = _block_shape(values, ndim=mgr.ndim) - kwargs['ndim'] = mgr.ndim - kwargs['placement'] = [0] - return self.make_block(values, klass=ObjectBlock, **kwargs) - def _slice(self, slicer): """ return a slice of my values """ if isinstance(slicer, tuple): @@ -2466,6 +2569,8 @@ def _try_coerce_args(self, values, other): raise ValueError("incompatible or non tz-aware value") other_mask = isnull(other) other = other.value + else: + raise TypeError return values, values_mask, other, other_mask @@ -3246,16 +3351,6 @@ def comp(s): return isnull(values) return _maybe_compare(values, getattr(s, 'asm8', s), operator.eq) - def _cast_scalar(block, scalar): - dtype, val = infer_dtype_from_scalar(scalar, pandas_dtype=True) - if not is_dtype_equal(block.dtype, dtype): - dtype = find_common_type([block.dtype, dtype]) - block = block.astype(dtype) - # use original value - val = scalar - - return block, val - masks = [comp(s) for i, s in enumerate(src_list)] 
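        # --- illustrative aside (assumed semantics, not in the diff) ---
        # each mask above marks where a source value occurs; a block that
        # is hit is first coerced via coerce_to_target_dtype() so the
        # replacement can be stored without a surprising cast, e.g.:
        #
        #     pd.Series([1, 2]).replace(1, 'a')   # int64 block upcasts to
        #                                         # object -> ['a', 2]
        # ----------------------------------------------------------------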
result_blocks = [] @@ -3278,8 +3373,8 @@ def _cast_scalar(block, scalar): # particular block m = masks[i][b.mgr_locs.indexer] if m.any(): - b, val = _cast_scalar(b, d) - new_rb.extend(b.putmask(m, val, inplace=True)) + b = b.coerce_to_target_dtype(d) + new_rb.extend(b.putmask(m, d, inplace=True)) else: new_rb.append(b) rb = new_rb @@ -4757,17 +4852,30 @@ def _transform_index(index, func, level=None): def _putmask_smart(v, m, n): """ - Return a new block, try to preserve dtype if possible. + Return a new ndarray, try to preserve dtype if possible. Parameters ---------- v : `values`, updated in-place (array like) m : `mask`, applies to both sides (array like) n : `new values` either scalar or an array like aligned with `values` + + Returns + ------- + values : ndarray with updated values + this *may* be a copy of the original + + See Also + -------- + ndarray.putmask """ + + # we cannot use np.asarray() here as we cannot have conversions + # that numpy does when numeric are mixed with strings + # n should be the length of the mask or a scalar here if not is_list_like(n): - n = np.array([n] * len(m)) + n = np.repeat(n, len(m)) elif isinstance(n, np.ndarray) and n.ndim == 0: # numpy scalar n = np.repeat(np.array(n, ndmin=1), len(m)) @@ -4781,10 +4889,21 @@ def _putmask_smart(v, m, n): if not _is_na_compat(v, nn[0]): raise ValueError - nn_at = nn.astype(v.dtype) + # we ignore ComplexWarning here + with catch_warnings(record=True): + nn_at = nn.astype(v.dtype) # avoid invalid dtype comparisons - if not is_numeric_v_string_like(nn, nn_at): + # between numbers & strings + + # only compare integers/floats + # don't compare integers to datetimelikes + if (not is_numeric_v_string_like(nn, nn_at) and + (is_float_dtype(nn.dtype) or + is_integer_dtype(nn.dtype) and + is_float_dtype(nn_at.dtype) or + is_integer_dtype(nn_at.dtype))): + comp = (nn == nn_at) if is_list_like(comp) and comp.all(): nv = v.copy() @@ -4793,21 +4912,28 @@ def _putmask_smart(v, m, n): except (ValueError, IndexError, TypeError): pass - # change the dtype + n = np.asarray(n) + + def _putmask_preserve(nv, n): + try: + nv[m] = n[m] + except (IndexError, ValueError): + nv[m] = n + return nv + + # preserves dtype if possible + if v.dtype.kind == n.dtype.kind: + return _putmask_preserve(v, n) + + # change the dtype if needed dtype, _ = maybe_promote(n.dtype) if is_extension_type(v.dtype) and is_object_dtype(dtype): - nv = v.get_values(dtype) + v = v.get_values(dtype) else: - nv = v.astype(dtype) + v = v.astype(dtype) - try: - nv[m] = n[m] - except ValueError: - idx, = np.where(np.squeeze(m)) - for mask_index, new_val in zip(idx, n[m]): - nv[mask_index] = new_val - return nv + return _putmask_preserve(v, n) def concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy): diff --git a/pandas/core/ops.py b/pandas/core/ops.py index 55473ec8d7cad..017afcd691194 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -1278,12 +1278,14 @@ def f(self, other, axis=default_axis, level=None): other = _align_method_FRAME(self, other, axis) if isinstance(other, pd.DataFrame): # Another DataFrame - return self._flex_compare_frame(other, na_op, str_rep, level) + return self._flex_compare_frame(other, na_op, str_rep, level, + try_cast=False) elif isinstance(other, ABCSeries): - return self._combine_series(other, na_op, None, axis, level) + return self._combine_series(other, na_op, None, axis, level, + try_cast=False) else: - return self._combine_const(other, na_op) + return self._combine_const(other, na_op, try_cast=False) f.__name__ = name 
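A short sketch of why the comparison wrappers above now pass
``try_cast=False`` (assumed behavior): a comparison produces booleans, and
casting that result back to the operand dtype would be wrong.

    import pandas as pd

    df = pd.DataFrame({'a': [1, 2, 3]})
    res = df.eq(2)                 # routed through _combine_const(..., try_cast=False)
    assert res['a'].dtype == bool  # stays boolean, not cast back to int64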
@@ -1296,12 +1298,14 @@ def f(self, other): if isinstance(other, pd.DataFrame): # Another DataFrame return self._compare_frame(other, func, str_rep) elif isinstance(other, ABCSeries): - return self._combine_series_infer(other, func) + return self._combine_series_infer(other, func, try_cast=False) else: # straight boolean comparisions we want to allow all columns # (regardless of dtype to pass thru) See #4537 for discussion. - res = self._combine_const(other, func, raise_on_error=False) + res = self._combine_const(other, func, + raise_on_error=False, + try_cast=False) return res.fillna(True).astype(bool) f.__name__ = name @@ -1381,13 +1385,13 @@ def f(self, other, axis=None): axis = self._get_axis_number(axis) if isinstance(other, self._constructor): - return self._compare_constructor(other, na_op) + return self._compare_constructor(other, na_op, try_cast=False) elif isinstance(other, (self._constructor_sliced, pd.DataFrame, ABCSeries)): raise Exception("input needs alignment for this object [%s]" % self._constructor) else: - return self._combine_const(other, na_op) + return self._combine_const(other, na_op, try_cast=False) f.__name__ = name diff --git a/pandas/core/panel.py b/pandas/core/panel.py index 69a8468552f54..609bf3186344a 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -8,6 +8,7 @@ import warnings from pandas.core.dtypes.cast import ( infer_dtype_from_scalar, + cast_scalar_to_array, maybe_cast_item) from pandas.core.dtypes.common import ( is_integer, is_list_like, @@ -178,11 +179,9 @@ def _init_data(self, data, copy, dtype, **kwargs): copy = False dtype = None elif is_scalar(data) and all(x is not None for x in passed_axes): - if dtype is None: - dtype, data = infer_dtype_from_scalar(data) - values = np.empty([len(x) for x in passed_axes], dtype=dtype) - values.fill(data) - mgr = self._init_matrix(values, passed_axes, dtype=dtype, + values = cast_scalar_to_array([len(x) for x in passed_axes], + data, dtype=dtype) + mgr = self._init_matrix(values, passed_axes, dtype=values.dtype, copy=False) copy = False else: # pragma: no cover @@ -327,7 +326,7 @@ def _init_matrix(self, data, axes, dtype=None, copy=False): # ---------------------------------------------------------------------- # Comparison methods - def _compare_constructor(self, other, func): + def _compare_constructor(self, other, func, try_cast=True): if not self._indexed_same(other): raise Exception('Can only compare identically-labeled ' 'same type objects') @@ -584,9 +583,7 @@ def __setitem__(self, key, value): shape[1:], tuple(map(int, value.shape)))) mat = np.asarray(value) elif is_scalar(value): - dtype, value = infer_dtype_from_scalar(value) - mat = np.empty(shape[1:], dtype=dtype) - mat.fill(value) + mat = cast_scalar_to_array(shape[1:], value) else: raise TypeError('Cannot set item of type: %s' % str(type(value))) @@ -719,13 +716,13 @@ def _combine(self, other, func, axis=0): "operation with %s" % (str(type(other)), str(type(self)))) - def _combine_const(self, other, func): + def _combine_const(self, other, func, try_cast=True): with np.errstate(all='ignore'): new_values = func(self.values, other) d = self._construct_axes_dict() return self._constructor(new_values, **d) - def _combine_frame(self, other, func, axis=0): + def _combine_frame(self, other, func, axis=0, try_cast=True): index, columns = self._get_plane_axes(axis) axis = self._get_axis_number(axis) @@ -744,7 +741,7 @@ def _combine_frame(self, other, func, axis=0): return self._constructor(new_values, self.items, self.major_axis, 
self.minor_axis)
 
-    def _combine_panel(self, other, func):
+    def _combine_panel(self, other, func, try_cast=True):
         items = self.items.union(other.items)
         major = self.major_axis.union(other.major_axis)
         minor = self.minor_axis.union(other.minor_axis)
diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py
index 5fe96d70fc16f..462fb18618949 100644
--- a/pandas/core/sparse/frame.py
+++ b/pandas/core/sparse/frame.py
@@ -500,7 +500,8 @@ def xs(self, key, axis=0, copy=False):
     # ----------------------------------------------------------------------
     # Arithmetic-related methods
 
-    def _combine_frame(self, other, func, fill_value=None, level=None):
+    def _combine_frame(self, other, func, fill_value=None, level=None,
+                       try_cast=True):
         this, other = self.align(other, join='outer', level=level,
                                  copy=False)
         new_index, new_columns = this.index, this.columns
@@ -543,7 +544,8 @@ def _combine_frame(self, other, func, fill_value=None, level=None):
             default_fill_value=new_fill_value
         ).__finalize__(self)
 
-    def _combine_match_index(self, other, func, level=None, fill_value=None):
+    def _combine_match_index(self, other, func, level=None, fill_value=None,
+                             try_cast=True):
         new_data = {}
 
         if fill_value is not None:
@@ -573,7 +575,8 @@ def _combine_match_index(self, other, func, level=None, fill_value=None):
             new_data, index=new_index, columns=self.columns,
             default_fill_value=fill_value).__finalize__(self)
 
-    def _combine_match_columns(self, other, func, level=None, fill_value=None):
+    def _combine_match_columns(self, other, func, level=None, fill_value=None,
+                               try_cast=True):
         # patched version of DataFrame._combine_match_columns to account for
         # NumPy circumventing __rsub__ with float64 types, e.g.: 3.0 - series,
         # where 3.0 is numpy.float64 and series is a SparseSeries. Still
@@ -599,7 +602,7 @@ def _combine_match_columns(self, other, func, level=None, fill_value=None):
             new_data, index=self.index, columns=union,
             default_fill_value=self.default_fill_value).__finalize__(self)
 
-    def _combine_const(self, other, func, raise_on_error=True):
+    def _combine_const(self, other, func, raise_on_error=True, try_cast=True):
         return self._apply_columns(lambda x: func(x, other))
 
     def _reindex_index(self, index, method, copy, level, fill_value=np.nan,
diff --git a/pandas/tests/dtypes/test_cast.py b/pandas/tests/dtypes/test_cast.py
index 6e07487b3e04f..d9fb458c83529 100644
--- a/pandas/tests/dtypes/test_cast.py
+++ b/pandas/tests/dtypes/test_cast.py
@@ -9,11 +9,14 @@
 from datetime import datetime, timedelta, date
 import numpy as np
 
-from pandas import Timedelta, Timestamp, DatetimeIndex, DataFrame, NaT, Series
+import pandas as pd
+from pandas import (Timedelta, Timestamp, DatetimeIndex,
+                    DataFrame, NaT, Period, Series)
 
 from pandas.core.dtypes.cast import (
     maybe_downcast_to_dtype,
     maybe_convert_objects,
+    cast_scalar_to_array,
     infer_dtype_from_scalar,
     infer_dtype_from_array,
     maybe_convert_string_to_object,
@@ -23,6 +26,8 @@
     CategoricalDtype,
     DatetimeTZDtype,
     PeriodDtype)
+from pandas.core.dtypes.common import (
+    is_dtype_equal)
 
 from pandas.util import testing as tm
 
@@ -96,8 +101,8 @@ def test_datetime_with_timezone(self):
 
 class TestInferDtype(object):
 
     def test_infer_dtype_from_scalar(self):
-        # Test that _infer_dtype_from_scalar is returning correct dtype for int
+        # Test that infer_dtype_from_scalar is returning correct dtype for int
         # and float.
for dtypec in [np.uint8, np.int8, np.uint16, np.int16, np.uint32,
@@ -137,29 +142,93 @@ def test_infer_dtype_from_scalar(self):
             dtype, val = infer_dtype_from_scalar(data)
             assert dtype == 'm8[ns]'
 
+        for tz in ['UTC', 'US/Eastern', 'Asia/Tokyo']:
+            dt = Timestamp(1, tz=tz)
+            dtype, val = infer_dtype_from_scalar(dt, pandas_dtype=True)
+            assert dtype == 'datetime64[ns, {0}]'.format(tz)
+            assert val == dt.value
+
+            dtype, val = infer_dtype_from_scalar(dt)
+            assert dtype == np.object_
+            assert val == dt
+
+        for freq in ['M', 'D']:
+            p = Period('2011-01-01', freq=freq)
+            dtype, val = infer_dtype_from_scalar(p, pandas_dtype=True)
+            assert dtype == 'period[{0}]'.format(freq)
+            assert val == p.ordinal
+
+            dtype, val = infer_dtype_from_scalar(p)
+            assert dtype == np.object_
+            assert val == p
+
+        # misc
         for data in [date(2000, 1, 1),
                      Timestamp(1, tz='US/Eastern'), 'foo']:
+
             dtype, val = infer_dtype_from_scalar(data)
             assert dtype == np.object_
 
+    def test_infer_dtype_from_scalar_errors(self):
+        with pytest.raises(ValueError):
+            infer_dtype_from_scalar(np.array([1]))
+
     @pytest.mark.parametrize(
-        "arr, expected",
-        [('foo', np.object_),
-         (b'foo', np.object_),
-         (1, np.int_),
-         (1.5, np.float_),
-         ([1], np.int_),
-         (np.array([1]), np.int_),
-         ([np.nan, 1, ''], np.object_),
-         (np.array([[1.0, 2.0]]), np.float_),
-         (Timestamp('20160101'), np.object_),
-         (np.datetime64('2016-01-01'), np.dtype('<M8[ns]')),

[... garbled in extraction: the remainder of the pandas/tests/dtypes changes (test_cast.py, test_common.py, test_convert.py (deleted), test_missing.py per the diffstat); the patch resumes mid-hunk in pandas/tests/frame/test_indexing.py::test_where_axis ...]

         # multiple dtypes (=> multiple Blocks)
-        df = pd.concat([DataFrame(np.random.randn(10, 2)),
-                        DataFrame(np.random.randint(0, 10, size=(10, 2)))],
-                       ignore_index=True, axis=1)
+        df = pd.concat([
+            DataFrame(np.random.randn(10, 2)),
+            DataFrame(np.random.randint(0, 10, size=(10, 2)), dtype='int64')],
+            ignore_index=True, axis=1)
         mask = DataFrame(False, columns=df.columns, index=df.index)
         s1 = Series(1, index=df.columns)
         s2 = Series(2, index=df.index)
 
         result = df.where(mask, s1, axis='columns')
         expected = DataFrame(1.0, columns=df.columns, index=df.index)
-        expected[2] = expected[2].astype(int)
-        expected[3] = expected[3].astype(int)
+        expected[2] = expected[2].astype('int64')
+        expected[3] = expected[3].astype('int64')
         assert_frame_equal(result, expected)
 
         result = df.copy()
@@ -2742,8 +2744,8 @@ def test_where_axis(self):
 
         result = df.where(mask, s2, axis='index')
         expected = DataFrame(2.0, columns=df.columns, index=df.index)
-        expected[2] = expected[2].astype(int)
-        expected[3] = expected[3].astype(int)
+        expected[2] = expected[2].astype('int64')
+        expected[3] = expected[3].astype('int64')
         assert_frame_equal(result, expected)
 
         result = df.copy()
diff --git a/pandas/tests/frame/test_operators.py b/pandas/tests/frame/test_operators.py
index 8ec6c6e6263d8..438d7481ecc3e 100644
--- a/pandas/tests/frame/test_operators.py
+++ b/pandas/tests/frame/test_operators.py
@@ -188,6 +188,7 @@ def test_timestamp_compare(self):
         df.loc[np.random.rand(len(df)) > 0.5, 'dates2'] = pd.NaT
         ops = {'gt': 'lt', 'lt': 'gt', 'ge': 'le', 'le': 'ge', 'eq': 'eq',
                'ne': 'ne'}
+
         for left, right in ops.items():
             left_f = getattr(operator, left)
             right_f = getattr(operator, right)
@@ -315,14 +316,12 @@ def _check_unary_op(op):
         # operator.neg is deprecated in numpy >= 1.9
         _check_unary_op(operator.inv)
 
-    def test_logical_typeerror(self):
-        if not compat.PY3:
-            pytest.raises(TypeError, self.frame.__eq__, 'foo')
-            pytest.raises(TypeError, self.frame.__lt__, 'foo')
-            pytest.raises(TypeError, self.frame.__gt__, 'foo')
-            pytest.raises(TypeError, self.frame.__ne__, 'foo')
-        else:
-            pytest.skip('test_logical_typeerror not tested on PY3')
+    @pytest.mark.parametrize('op,res', [('__eq__', False),
+                                        ('__ne__', True)])
+    def test_logical_typeerror_with_non_valid(self, op, res):
+        # we are comparing floats vs a string
+        result = getattr(self.frame, op)('foo')
+        assert bool(result.all().all()) is res
 
     def test_logical_with_nas(self):
         d = DataFrame({'a': [np.nan, False], 'b': [True, True]})
@@ -832,9 +831,11 @@ def test_combineSeries(self):
         assert 'E' in larger_added
         assert np.isnan(larger_added['E']).all()
 
-        # vs mix (upcast) as needed
+        # no upcast needed
         added = self.mixed_float + series
-        _check_mixed_float(added, dtype='float64')
+        _check_mixed_float(added)
+
+        # vs mix (upcast) as needed
         added = self.mixed_float + series.astype('float32')
         _check_mixed_float(added, dtype=dict(C=None))
         added = self.mixed_float + series.astype('float16')
diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py
index 25cc810299678..752d2deb53304 100644
--- a/pandas/tests/indexing/test_coercion.py
+++ b/pandas/tests/indexing/test_coercion.py
@@ -101,9 +101,22 @@ def test_setitem_series_int64(self):
         exp = pd.Series([1, 1 + 1j, 3, 4])
         self._assert_setitem_series_conversion(obj, 1 + 1j, exp,
                                                np.complex128)
 
-        # int + bool -> int
-        exp = pd.Series([1, 1, 3, 4])
-        self._assert_setitem_series_conversion(obj, True, exp, np.int64)
+        # int + bool -> object
+        exp = pd.Series([1, True, 3, 4])
+        self._assert_setitem_series_conversion(obj, True, exp, np.object)
+
+    def test_setitem_series_int8(self):
+        # integer dtype coercion (no change)
+        obj = pd.Series([1, 2, 3, 4], dtype=np.int8)
+        assert obj.dtype == np.int8
+
+        exp = pd.Series([1, 1, 3, 4], dtype=np.int8)
+        self._assert_setitem_series_conversion(obj, np.int32(1), exp, np.int8)
+
+        # BUG: this should coerce to Series([1, 512, 3, 4], dtype=np.int16)
+        exp = pd.Series([1, 0, 3, 4], dtype=np.int8)
+        self._assert_setitem_series_conversion(obj, np.int16(2**9), exp,
+                                               np.int8)
 
     def test_setitem_series_float64(self):
         obj = pd.Series([1.1, 2.2, 3.3, 4.4])
@@ -122,9 +135,9 @@ def test_setitem_series_float64(self):
         self._assert_setitem_series_conversion(obj, 1 + 1j, exp,
                                                np.complex128)
 
-        # float + bool -> float
-        exp = pd.Series([1.1, 1.0, 3.3, 4.4])
-        self._assert_setitem_series_conversion(obj, True, exp, np.float64)
+        # float + bool -> object
+        exp = pd.Series([1.1, True, 3.3, 4.4])
+        self._assert_setitem_series_conversion(obj, True, exp, np.object)
 
     def test_setitem_series_complex128(self):
         obj = pd.Series([1 + 1j, 2 + 2j, 3 + 3j, 4 + 4j])
@@ -132,7 +145,7 @@ def test_setitem_series_complex128(self):
 
         # complex + int -> complex
         exp = pd.Series([1 + 1j, 1, 3 + 3j, 4 + 4j])
-        self._assert_setitem_series_conversion(obj, True, exp, np.complex128)
+        self._assert_setitem_series_conversion(obj, 1, exp, np.complex128)
 
         # complex + float -> complex
         exp = pd.Series([1 + 1j, 1.1, 3 + 3j, 4 + 4j])
@@ -142,9 +155,9 @@ def test_setitem_series_complex128(self):
         exp = pd.Series([1 + 1j, 1 + 1j, 3 + 3j, 4 + 4j])
         self._assert_setitem_series_conversion(obj, 1 + 1j, exp,
                                                np.complex128)
 
-        # complex + bool -> complex
-        exp = pd.Series([1 + 1j, 1, 3 + 3j, 4 + 4j])
-        self._assert_setitem_series_conversion(obj, True, exp, np.complex128)
+        # complex + bool -> object
+        exp = pd.Series([1 + 1j, True, 3 + 3j, 4 + 4j])
+        self._assert_setitem_series_conversion(obj, True, exp, np.object)
 
     def test_setitem_series_bool(self):
         obj = pd.Series([True, False, True, False])
@@ -198,14 +211,18 @@ def test_setitem_series_datetime64(self):
                                                 exp, 'datetime64[ns]')
 
         # datetime64 + int -> object
-        # ToDo: The result must be object
         exp = pd.Series([pd.Timestamp('2011-01-01'),
-                         pd.Timestamp(1),
+                         1,
                         pd.Timestamp('2011-01-03'),
                         pd.Timestamp('2011-01-04')])
-        self._assert_setitem_series_conversion(obj, 1, exp, 'datetime64[ns]')
+        self._assert_setitem_series_conversion(obj, 1, exp, 'object')
 
-        # ToDo: add more tests once the above issue has been fixed
+        # datetime64 + object -> object
+        exp = pd.Series([pd.Timestamp('2011-01-01'),
+                         'x',
+                         pd.Timestamp('2011-01-03'),
+                         pd.Timestamp('2011-01-04')])
+        self._assert_setitem_series_conversion(obj, 'x', exp, np.object)
 
     def test_setitem_series_datetime64tz(self):
         tz = 'US/Eastern'
@@ -224,19 +241,59 @@ def test_setitem_series_datetime64tz(self):
         self._assert_setitem_series_conversion(obj, value, exp,
                                                'datetime64[ns, US/Eastern]')
 
+        # datetime64tz + datetime64tz (different tz) -> object
+        exp = pd.Series([pd.Timestamp('2011-01-01', tz=tz),
+                         pd.Timestamp('2012-01-01', tz='US/Pacific'),
+                         pd.Timestamp('2011-01-03', tz=tz),
+                         pd.Timestamp('2011-01-04', tz=tz)])
+        value = pd.Timestamp('2012-01-01', tz='US/Pacific')
+        self._assert_setitem_series_conversion(obj, value, exp, np.object)
+
+        # datetime64tz + datetime64 -> object
+        exp = pd.Series([pd.Timestamp('2011-01-01', tz=tz),
+                         pd.Timestamp('2012-01-01'),
+                         pd.Timestamp('2011-01-03', tz=tz),
+                         pd.Timestamp('2011-01-04', tz=tz)])
+        value = pd.Timestamp('2012-01-01')
+        self._assert_setitem_series_conversion(obj, value, exp, np.object)
+
         # datetime64tz + int -> object
-        # ToDo: The result must be object
         exp = pd.Series([pd.Timestamp('2011-01-01', tz=tz),
-                         pd.Timestamp(1, tz=tz),
+                         1,
                         pd.Timestamp('2011-01-03', tz=tz),
                         pd.Timestamp('2011-01-04', tz=tz)])
-        self._assert_setitem_series_conversion(obj, 1, exp,
-                                               'datetime64[ns, US/Eastern]')
+        self._assert_setitem_series_conversion(obj, 1, exp, np.object)
 
         # ToDo: add more tests once the above issue has been fixed
 
     def test_setitem_series_timedelta64(self):
-        pass
+        obj = pd.Series([pd.Timedelta('1 day'),
+                         pd.Timedelta('2 day'),
+                         pd.Timedelta('3 day'),
+                         pd.Timedelta('4 day')])
+        assert obj.dtype == 'timedelta64[ns]'
+
+        # timedelta64 + timedelta64 -> timedelta64
+        exp = pd.Series([pd.Timedelta('1 day'),
+                         pd.Timedelta('12 day'),
+                         pd.Timedelta('3 day'),
+                         pd.Timedelta('4 day')])
+        self._assert_setitem_series_conversion(obj, pd.Timedelta('12 day'),
+                                               exp, 'timedelta64[ns]')
+
+        # timedelta64 + int -> object
+        exp = pd.Series([pd.Timedelta('1 day'),
+                         1,
+                         pd.Timedelta('3 day'),
+                         pd.Timedelta('4 day')])
+        self._assert_setitem_series_conversion(obj, 1, exp, np.object)
+
+        # timedelta64 + object -> object
+        exp = pd.Series([pd.Timedelta('1 day'),
+                         'x',
+                         pd.Timedelta('3 day'),
+                         pd.Timedelta('4 day')])
+        self._assert_setitem_series_conversion(obj, 'x', exp, np.object)
 
     def test_setitem_series_period(self):
         pass
@@ -610,13 +667,13 @@ def _where_int64_common(self, klass):
         self._assert_where_conversion(obj, cond, values, exp,
                                       np.complex128)
 
-        # int + bool -> int
-        exp = klass([1, 1, 3, 1])
-        self._assert_where_conversion(obj, cond, True, exp, np.int64)
+        # int + bool -> object
+        exp = klass([1, True, 3, True])
+        self._assert_where_conversion(obj, cond, True, exp, np.object)
 
         values = klass([True, False, True, True])
-        exp = klass([1, 0, 3, 1])
-        self._assert_where_conversion(obj, cond, values, exp, np.int64)
+        exp = klass([1, False, 3, True])
+        self._assert_where_conversion(obj, cond, values, exp, np.object)
 
     def test_where_series_int64(self):
         self._where_int64_common(pd.Series)
@@ -656,13 +713,13 @@ def _where_float64_common(self, klass):
         self._assert_where_conversion(obj, cond, values, exp,
                                       np.complex128)
 
-        # float + bool -> float
-        exp = klass([1.1, 1.0, 3.3,
1.0]) - self._assert_where_conversion(obj, cond, True, exp, np.float64) + # float + bool -> object + exp = klass([1.1, True, 3.3, True]) + self._assert_where_conversion(obj, cond, True, exp, np.object) values = klass([True, False, True, True]) - exp = klass([1.1, 0.0, 3.3, 1.0]) - self._assert_where_conversion(obj, cond, values, exp, np.float64) + exp = klass([1.1, False, 3.3, True]) + self._assert_where_conversion(obj, cond, values, exp, np.object) def test_where_series_float64(self): self._where_float64_common(pd.Series) @@ -699,45 +756,46 @@ def test_where_series_complex128(self): exp = pd.Series([1 + 1j, 6 + 6j, 3 + 3j, 8 + 8j]) self._assert_where_conversion(obj, cond, values, exp, np.complex128) - # complex + bool -> complex - exp = pd.Series([1 + 1j, 1, 3 + 3j, 1]) - self._assert_where_conversion(obj, cond, True, exp, np.complex128) + # complex + bool -> object + exp = pd.Series([1 + 1j, True, 3 + 3j, True]) + self._assert_where_conversion(obj, cond, True, exp, np.object) values = pd.Series([True, False, True, True]) - exp = pd.Series([1 + 1j, 0, 3 + 3j, 1]) - self._assert_where_conversion(obj, cond, values, exp, np.complex128) + exp = pd.Series([1 + 1j, False, 3 + 3j, True]) + self._assert_where_conversion(obj, cond, values, exp, np.object) def test_where_index_complex128(self): pass def test_where_series_bool(self): + obj = pd.Series([True, False, True, False]) assert obj.dtype == np.bool cond = pd.Series([True, False, True, False]) - # bool + int -> int - exp = pd.Series([1, 1, 1, 1]) - self._assert_where_conversion(obj, cond, 1, exp, np.int64) + # bool + int -> object + exp = pd.Series([True, 1, True, 1]) + self._assert_where_conversion(obj, cond, 1, exp, np.object) values = pd.Series([5, 6, 7, 8]) - exp = pd.Series([1, 6, 1, 8]) - self._assert_where_conversion(obj, cond, values, exp, np.int64) + exp = pd.Series([True, 6, True, 8]) + self._assert_where_conversion(obj, cond, values, exp, np.object) - # bool + float -> float - exp = pd.Series([1.0, 1.1, 1.0, 1.1]) - self._assert_where_conversion(obj, cond, 1.1, exp, np.float64) + # bool + float -> object + exp = pd.Series([True, 1.1, True, 1.1]) + self._assert_where_conversion(obj, cond, 1.1, exp, np.object) values = pd.Series([5.5, 6.6, 7.7, 8.8]) - exp = pd.Series([1.0, 6.6, 1.0, 8.8]) - self._assert_where_conversion(obj, cond, values, exp, np.float64) + exp = pd.Series([True, 6.6, True, 8.8]) + self._assert_where_conversion(obj, cond, values, exp, np.object) - # bool + complex -> complex - exp = pd.Series([1, 1 + 1j, 1, 1 + 1j]) - self._assert_where_conversion(obj, cond, 1 + 1j, exp, np.complex128) + # bool + complex -> object + exp = pd.Series([True, 1 + 1j, True, 1 + 1j]) + self._assert_where_conversion(obj, cond, 1 + 1j, exp, np.object) values = pd.Series([5 + 5j, 6 + 6j, 7 + 7j, 8 + 8j]) - exp = pd.Series([1, 6 + 6j, 1, 8 + 8j]) - self._assert_where_conversion(obj, cond, values, exp, np.complex128) + exp = pd.Series([True, 6 + 6j, True, 8 + 8j]) + self._assert_where_conversion(obj, cond, values, exp, np.object) # bool + bool -> bool exp = pd.Series([True, True, True, True]) @@ -776,10 +834,15 @@ def test_where_series_datetime64(self): pd.Timestamp('2012-01-04')]) self._assert_where_conversion(obj, cond, values, exp, 'datetime64[ns]') - # ToDo: coerce to object - msg = "cannot coerce a Timestamp with a tz on a naive Block" - with tm.assert_raises_regex(TypeError, msg): - obj.where(cond, pd.Timestamp('2012-01-01', tz='US/Eastern')) + # datetime64 + datetime64tz -> object + exp = pd.Series([pd.Timestamp('2011-01-01'), + 
pd.Timestamp('2012-01-01', tz='US/Eastern'),
+                         pd.Timestamp('2011-01-03'),
+                         pd.Timestamp('2012-01-01', tz='US/Eastern')])
+        self._assert_where_conversion(
+            obj, cond,
+            pd.Timestamp('2012-01-01', tz='US/Eastern'),
+            exp, np.object)
 
         # ToDo: do not coerce to UTC, must be object
         values = pd.Series([pd.Timestamp('2012-01-01', tz='US/Eastern'),
@@ -898,7 +961,7 @@ def test_fillna_series_int64(self):
 
     def test_fillna_index_int64(self):
         pass
 
-    def _fillna_float64_common(self, klass):
+    def _fillna_float64_common(self, klass, complex):
         obj = klass([1.1, np.nan, 3.3, 4.4])
         assert obj.dtype == np.float64
 
@@ -910,26 +973,21 @@ def _fillna_float64_common(self, klass):
         exp = klass([1.1, 1.1, 3.3, 4.4])
         self._assert_fillna_conversion(obj, 1.1, exp, np.float64)
 
-        if klass is pd.Series:
-            # float + complex -> complex
-            exp = klass([1.1, 1 + 1j, 3.3, 4.4])
-            self._assert_fillna_conversion(obj, 1 + 1j, exp, np.complex128)
-        elif klass is pd.Index:
-            # float + complex -> object
-            exp = klass([1.1, 1 + 1j, 3.3, 4.4])
-            self._assert_fillna_conversion(obj, 1 + 1j, exp, np.object)
-        else:
-            NotImplementedError
+        # float + complex -> complex for Series,
+        #                    object for Index
+        #                    (we don't support a complex Index)
+        exp = klass([1.1, 1 + 1j, 3.3, 4.4])
+        self._assert_fillna_conversion(obj, 1 + 1j, exp, complex)
 
-        # float + bool -> float
-        exp = klass([1.1, 1.0, 3.3, 4.4])
-        self._assert_fillna_conversion(obj, True, exp, np.float64)
+        # float + bool -> object
+        exp = klass([1.1, True, 3.3, 4.4])
+        self._assert_fillna_conversion(obj, True, exp, np.object)
 
     def test_fillna_series_float64(self):
-        self._fillna_float64_common(pd.Series)
+        self._fillna_float64_common(pd.Series, complex=np.complex128)
 
     def test_fillna_index_float64(self):
-        self._fillna_float64_common(pd.Index)
+        self._fillna_float64_common(pd.Index, complex=np.object)
 
     def test_fillna_series_complex128(self):
         obj = pd.Series([1 + 1j, np.nan, 3 + 3j, 4 + 4j])
@@ -947,12 +1005,12 @@ def test_fillna_series_complex128(self):
         exp = pd.Series([1 + 1j, 1 + 1j, 3 + 3j, 4 + 4j])
         self._assert_fillna_conversion(obj, 1 + 1j, exp, np.complex128)
 
-        # complex + bool -> complex
-        exp = pd.Series([1 + 1j, 1, 3 + 3j, 4 + 4j])
-        self._assert_fillna_conversion(obj, True, exp, np.complex128)
+        # complex + bool -> object
+        exp = pd.Series([1 + 1j, True, 3 + 3j, 4 + 4j])
+        self._assert_fillna_conversion(obj, True, exp, np.object)
 
     def test_fillna_index_complex128(self):
-        self._fillna_float64_common(pd.Index)
+        self._fillna_float64_common(pd.Index, complex=np.object)
 
     def test_fillna_series_bool(self):
         # bool can't hold NaN
@@ -985,12 +1043,11 @@ def test_fillna_series_datetime64(self):
         self._assert_fillna_conversion(obj, value, exp, np.object)
 
         # datetime64 + int => object
-        # ToDo: must be coerced to object
         exp = pd.Series([pd.Timestamp('2011-01-01'),
-                         pd.Timestamp(1),
+                         1,
                          pd.Timestamp('2011-01-03'),
                          pd.Timestamp('2011-01-04')])
-        self._assert_fillna_conversion(obj, 1, exp, 'datetime64[ns]')
+        self._assert_fillna_conversion(obj, 1, exp, 'object')
 
         # datetime64 + object => object
         exp = pd.Series([pd.Timestamp('2011-01-01'),
@@ -1033,14 +1090,12 @@ def test_fillna_series_datetime64tz(self):
         value = pd.Timestamp('2012-01-01', tz='Asia/Tokyo')
         self._assert_fillna_conversion(obj, value, exp, np.object)
 
-        # datetime64tz + int => datetime64tz
-        # ToDo: must be object
+        # datetime64tz + int => object
         exp = pd.Series([pd.Timestamp('2011-01-01', tz=tz),
-                         pd.Timestamp(1, tz=tz),
+                         1,
                          pd.Timestamp('2011-01-03', tz=tz),
                          pd.Timestamp('2011-01-04', tz=tz)])
-        self._assert_fillna_conversion(obj, 1, exp,
-                                       'datetime64[ns, US/Eastern]')
+        self._assert_fillna_conversion(obj, 1, exp, np.object)
 
         # datetime64tz + object => object
         exp = pd.Series([pd.Timestamp('2011-01-01', tz=tz),
@@ -1187,8 +1242,8 @@ def _assert_replace_conversion(self, from_key, to_key, how):
             (from_key == 'complex128' and
              to_key in ('int64', 'float64'))):
 
-            # buggy on 32-bit
-            if tm.is_platform_32bit():
+            # buggy on 32-bit / windows
+            if compat.is_platform_32bit() or compat.is_platform_windows():
                 pytest.skip("32-bit platform buggy: {0} -> {1}".format
                             (from_key, to_key))
diff --git a/pandas/tests/indexing/test_datetime.py b/pandas/tests/indexing/test_datetime.py
index 8e8fc835b11f7..ddac80fbc4693 100644
--- a/pandas/tests/indexing/test_datetime.py
+++ b/pandas/tests/indexing/test_datetime.py
@@ -1,5 +1,3 @@
-import pytest
-
 import numpy as np
 import pandas as pd
 from pandas import date_range, Index, DataFrame, Series, Timestamp
@@ -83,10 +81,12 @@ def test_indexing_with_datetime_tz(self):
                                  'US/Pacific')
 
         # trying to set a single element on a part of a different timezone
-        def f():
-            df.loc[df.new_col == 'new', 'time'] = v
+        # this converts to object
+        df2 = df.copy()
+        df2.loc[df2.new_col == 'new', 'time'] = v
 
-        pytest.raises(ValueError, f)
+        expected = Series([v[0], df.loc[1, 'time']], name='time')
+        tm.assert_series_equal(df2.time, expected)
 
         v = df.loc[df.new_col == 'new', 'time'] + pd.Timedelta('1s')
         df.loc[df.new_col == 'new', 'time'] = v
diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py
index 98f5d5eb140df..e5b70a9fadb8f 100644
--- a/pandas/tests/indexing/test_indexing.py
+++ b/pandas/tests/indexing/test_indexing.py
@@ -382,6 +382,12 @@ def test_multi_assign(self):
         tm.assert_frame_equal(df2, expected)
 
         # with an ndarray on rhs
+        # coerces to float64 because the assigned ndarray has float64 dtype
+        # GH 14001
+        expected = DataFrame({'FC': ['a', np.nan, 'a', 'b', 'a', 'b'],
+                              'PF': [0, 0, 0, 0, 1, 1],
+                              'col1': [0., 1., 4., 6., 8., 10.],
+                              'col2': [12, 7, 16, np.nan, 20, 22]})
         df2 = df.copy()
         df2.loc[mask, cols] = dft.loc[mask, cols].values
         tm.assert_frame_equal(df2, expected)
diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py
index 7aab7df7169d4..a736f3aa74558 100644
--- a/pandas/tests/series/test_analytics.py
+++ b/pandas/tests/series/test_analytics.py
@@ -1033,11 +1033,11 @@ def test_clip_with_datetimes(self):
 
         # naive and tz-aware datetimes
 
         t = Timestamp('2015-12-01 09:30:30')
-        s = Series([Timestamp('2015-12-01 09:30:00'), Timestamp(
-            '2015-12-01 09:31:00')])
+        s = Series([Timestamp('2015-12-01 09:30:00'),
+                    Timestamp('2015-12-01 09:31:00')])
         result = s.clip(upper=t)
-        expected = Series([Timestamp('2015-12-01 09:30:00'), Timestamp(
-            '2015-12-01 09:30:30')])
+        expected = Series([Timestamp('2015-12-01 09:30:00'),
+                           Timestamp('2015-12-01 09:30:30')])
         assert_series_equal(result, expected)
 
         t = Timestamp('2015-12-01 09:30:30', tz='US/Eastern')
diff --git a/pandas/tests/series/test_indexing.py b/pandas/tests/series/test_indexing.py
index 6d8a54b538237..23283733c492a 100644
--- a/pandas/tests/series/test_indexing.py
+++ b/pandas/tests/series/test_indexing.py
@@ -1094,6 +1094,11 @@ def test_where(self):
         rs = s2.where(cond[:3], -s2)
         assert_series_equal(rs, expected)
 
+    def test_where_error(self):
+
+        s = Series(np.random.randn(5))
+        cond = s > 0
+
         pytest.raises(ValueError, s.where, 1)
         pytest.raises(ValueError, s.where, cond[:3].values, -s)
 
@@ -1109,6 +1114,8 @@ def test_where(self):
         pytest.raises(ValueError, s.__setitem__, tuple([[[True, False]]]),
                       [])
 
+    def test_where_unsafe(self):
+
         # unsafe dtype changes
         for dtype in [np.int8, np.int16, np.int32, np.int64, np.float16,
                       np.float32, np.float64]:
@@ -1374,9 +1381,9 @@ def test_where_dups(self):
         expected = Series([5, 11, 2, 5, 11, 2], index=[0, 1, 2, 0, 1, 2])
         assert_series_equal(comb, expected)
 
-    def test_where_datetime(self):
+    def test_where_datetime_conversion(self):
         s = Series(date_range('20130102', periods=2))
-        expected = Series([10, 10], dtype='datetime64[ns]')
+        expected = Series([10, 10])
         mask = np.array([False, False])
 
         rs = s.where(mask, [10, 10])
@@ -1392,7 +1399,7 @@ def test_where_datetime(self):
         assert_series_equal(rs, expected)
 
         rs = s.where(mask, [10.0, np.nan])
-        expected = Series([10, None], dtype='datetime64[ns]')
+        expected = Series([10, None], dtype='object')
         assert_series_equal(rs, expected)
 
         # GH 15701
@@ -1403,9 +1410,9 @@ def test_where_datetime(self):
         expected = Series([pd.NaT, s[1]])
         assert_series_equal(rs, expected)
 
-    def test_where_timedelta(self):
+    def test_where_timedelta_coerce(self):
         s = Series([1, 2], dtype='timedelta64[ns]')
-        expected = Series([10, 10], dtype='timedelta64[ns]')
+        expected = Series([10, 10])
         mask = np.array([False, False])
 
         rs = s.where(mask, [10, 10])
@@ -1421,7 +1428,7 @@ def test_where_timedelta(self):
         assert_series_equal(rs, expected)
 
         rs = s.where(mask, [10.0, np.nan])
-        expected = Series([10, None], dtype='timedelta64[ns]')
+        expected = Series([10, None], dtype='object')
         assert_series_equal(rs, expected)
 
     def test_mask(self):
diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py
index b5948e75aa73e..24dd90e40fa35 100644
--- a/pandas/tests/series/test_missing.py
+++ b/pandas/tests/series/test_missing.py
@@ -58,14 +58,14 @@ def test_remove_na_deprecation(self):
 
     def test_timedelta_fillna(self):
         # GH 3371
-        s = Series([Timestamp('20130101'), Timestamp('20130101'), Timestamp(
-            '20130102'), Timestamp('20130103 9:01:01')])
+        s = Series([Timestamp('20130101'), Timestamp('20130101'),
+                    Timestamp('20130102'), Timestamp('20130103 9:01:01')])
         td = s.diff()
 
         # reg fillna
         result = td.fillna(0)
-        expected = Series([timedelta(0), timedelta(0), timedelta(1), timedelta(
-            days=1, seconds=9 * 3600 + 60 + 1)])
+        expected = Series([timedelta(0), timedelta(0), timedelta(1),
+                           timedelta(days=1, seconds=9 * 3600 + 60 + 1)])
         assert_series_equal(result, expected)
 
         # interpreted as seconds
 
         result = td.fillna(timedelta(days=1, seconds=1))
-        expected = Series([timedelta(days=1, seconds=1), timedelta(
-            0), timedelta(1), timedelta(days=1, seconds=9 * 3600 + 60 + 1)])
+        expected = Series([timedelta(days=1, seconds=1), timedelta(0),
+                           timedelta(1),
+                           timedelta(days=1, seconds=9 * 3600 + 60 + 1)])
         assert_series_equal(result, expected)
 
         result = td.fillna(np.timedelta64(int(1e9)))
@@ -144,6 +145,7 @@ def test_datetime64_fillna(self):
         assert_series_equal(result, expected)
 
     def test_datetime64_tz_fillna(self):
+
         for tz in ['US/Eastern', 'Asia/Tokyo']:
             # DatetimeBlock
             s = Series([Timestamp('2011-01-01 10:00'), pd.NaT,
@@ -278,6 +280,40 @@ def test_datetime64_tz_fillna(self):
                               pd.Timestamp('2012-11-11 00:00:00+01:00')])
             assert_series_equal(df.fillna(method='bfill'), exp)
 
+    def test_fillna_consistency(self):
+        # GH 16402
+        # fillna with a tz-aware value on tz-naive data should coerce to object
+
+        s = Series([Timestamp('20130101'), pd.NaT])
+
+        result = s.fillna(Timestamp('20130101', tz='US/Eastern'))
+        expected = Series([Timestamp('20130101'),
+
Timestamp('2013-01-01', tz='US/Eastern')], + dtype='object') + assert_series_equal(result, expected) + + # where (we ignore the raise_on_error) + result = s.where([True, False], + Timestamp('20130101', tz='US/Eastern'), + raise_on_error=False) + assert_series_equal(result, expected) + + result = s.where([True, False], + Timestamp('20130101', tz='US/Eastern'), + raise_on_error=True) + assert_series_equal(result, expected) + + # with a non-datetime + result = s.fillna('foo') + expected = Series([Timestamp('20130101'), + 'foo']) + assert_series_equal(result, expected) + + # assignment + s2 = s.copy() + s2[1] = 'foo' + assert_series_equal(s2, expected) + def test_datetime64tz_fillna_round_issue(self): # GH 14872
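The new ``test_fillna_consistency`` above pins down the user-visible rule from
GH 16402: a tz-aware fill value applied to tz-naive data no longer raises and
instead falls back to ``object`` dtype. A short usage sketch of that rule,
using public API only (the values are illustrative, not taken from the patch):

    import pandas as pd

    s = pd.Series([pd.Timestamp('20130101'), pd.NaT])

    # tz-aware fill into tz-naive data: coerces to object, never raises
    out = s.fillna(pd.Timestamp('20130101', tz='US/Eastern'))
    assert out.dtype == object

    # .where() follows the same rule for a tz-aware `other`
    out = s.where([True, False], pd.Timestamp('20130101', tz='US/Eastern'))
    assert out.dtype == object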