From c62bb856c86c9a2cd3e6ee60f306afbdbfea96e7 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 11 Mar 2020 20:15:15 -0700 Subject: [PATCH 1/5] BUG: DTI/TDI/PI get_indexer_non_unique --- pandas/core/indexes/base.py | 5 +--- pandas/core/indexes/datetimelike.py | 26 ++++++++++++++++++-- pandas/core/indexes/datetimes.py | 23 +++++++++++++++++- pandas/core/indexes/period.py | 20 +++++++++++----- pandas/core/indexes/timedeltas.py | 7 ++++++ pandas/tests/indexes/test_base.py | 37 +++++++++++++++++++++++++++++ 6 files changed, 105 insertions(+), 13 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 98e3b3ad258ea..7d2b811b3ee39 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -29,7 +29,6 @@ ensure_platform_int, is_bool, is_bool_dtype, - is_categorical, is_categorical_dtype, is_datetime64_any_dtype, is_dtype_equal, @@ -4652,10 +4651,8 @@ def get_indexer_non_unique(self, target): if pself is not self or ptarget is not target: return pself.get_indexer_non_unique(ptarget) - if is_categorical(target): + if is_categorical_dtype(target.dtype): tgt_values = np.asarray(target) - elif self.is_all_dates and target.is_all_dates: # GH 30399 - tgt_values = target.asi8 else: tgt_values = target._ndarray_values diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 894e1d95a17bc..4dd9c9af56528 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -8,13 +8,14 @@ from pandas._libs import NaT, iNaT, join as libjoin, lib from pandas._libs.tslibs import timezones -from pandas._typing import Label +from pandas._typing import DtypeObj, Label from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError from pandas.util._decorators import Appender, cache_readonly from pandas.core.dtypes.common import ( ensure_int64, + ensure_platform_int, is_bool_dtype, is_categorical_dtype, is_dtype_equal, @@ -33,7 +34,7 @@ from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin from pandas.core.base import _shared_docs import pandas.core.indexes.base as ibase -from pandas.core.indexes.base import Index, _index_shared_docs +from pandas.core.indexes.base import Index, _index_shared_docs, ensure_index from pandas.core.indexes.extension import ( ExtensionIndex, inherit_names, @@ -102,6 +103,12 @@ class DatetimeIndexOpsMixin(ExtensionIndex): def is_all_dates(self) -> bool: return True + def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: + """ + Can we compare values of the given dtype to our own? + """ + raise AbstractMethodError(self) + # ------------------------------------------------------------------------ # Abstract data attributes @@ -427,6 +434,21 @@ def _partial_date_slice( # try to find the dates return (lhs_mask & rhs_mask).nonzero()[0] + @Appender(Index.get_indexer_non_unique.__doc__) + def get_indexer_non_unique(self, target): + target = ensure_index(target) + pself, ptarget = self._maybe_promote(target) + if pself is not self or ptarget is not target: + return pself.get_indexer_non_unique(ptarget) + + if not self._is_comparable_dtype(target.dtype): + no_matches = -1 * np.ones(self.shape, dtype=np.intp) + return no_matches, no_matches + + tgt_values = target.asi8 + indexer, missing = self._engine.get_indexer_non_unique(tgt_values) + return ensure_platform_int(indexer), missing + # -------------------------------------------------------------------- __add__ = make_wrapped_arith_op("__add__") diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 185ad8e4c365a..6607e450b1203 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -7,9 +7,18 @@ from pandas._libs import NaT, Period, Timestamp, index as libindex, lib, tslib as libts from pandas._libs.tslibs import fields, parsing, timezones +from pandas._typing import DtypeObj from pandas.util._decorators import cache_readonly -from pandas.core.dtypes.common import _NS_DTYPE, is_float, is_integer, is_scalar +from pandas.core.dtypes.common import ( + _NS_DTYPE, + is_datetime64_any_dtype, + is_datetime64_dtype, + is_datetime64tz_dtype, + is_float, + is_integer, + is_scalar, +) from pandas.core.dtypes.dtypes import DatetimeTZDtype from pandas.core.dtypes.missing import is_valid_nat_for_dtype @@ -306,6 +315,18 @@ def _convert_for_op(self, value): return Timestamp(value).asm8 raise ValueError("Passed item and index have different timezone") + def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: + """ + Can we compare values of the given dtype to our own? + """ + if not is_datetime64_any_dtype(dtype): + return False + if self.tz is not None: + # If we have tz, we can compare to tzaware + return is_datetime64tz_dtype(dtype) + # if we dont have tz, we can only compare to tznaive + return is_datetime64_dtype(dtype) + # -------------------------------------------------------------------- # Rendering Methods diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 9ab80c0b6145c..a978969d37739 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -9,7 +9,7 @@ from pandas._libs.tslibs import frequencies as libfrequencies, resolution from pandas._libs.tslibs.parsing import parse_time_string from pandas._libs.tslibs.period import Period -from pandas._typing import Label +from pandas._typing import DtypeObj, Label from pandas.util._decorators import Appender, cache_readonly from pandas.core.dtypes.common import ( @@ -20,6 +20,7 @@ is_float, is_integer, is_object_dtype, + is_period_dtype, is_scalar, pandas_dtype, ) @@ -295,6 +296,14 @@ def _maybe_convert_timedelta(self, other): # raise when input doesn't have freq raise raise_on_incompatible(self, None) + def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: + """ + Can we compare values of the given dtype to our own? + """ + if not is_period_dtype(dtype): + return False + return dtype.freq == self.freq + # ------------------------------------------------------------------------ # Rendering Methods @@ -451,12 +460,11 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): def get_indexer_non_unique(self, target): target = ensure_index(target) - if isinstance(target, PeriodIndex): - if target.freq != self.freq: - no_matches = -1 * np.ones(self.shape, dtype=np.intp) - return no_matches, no_matches + if not self._is_comparable_dtype(target.dtype): + no_matches = -1 * np.ones(self.shape, dtype=np.intp) + return no_matches, no_matches - target = target.asi8 + target = target.asi8 indexer, missing = self._int64index.get_indexer_non_unique(target) return ensure_platform_int(indexer), missing diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 5e4a8e83bd95b..f884dcc5961eb 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -1,6 +1,7 @@ """ implement the TimedeltaIndex """ from pandas._libs import NaT, Timedelta, index as libindex +from pandas._typing import DtypeObj from pandas.util._decorators import Appender from pandas.core.dtypes.common import ( @@ -215,6 +216,12 @@ def _maybe_promote(self, other): other = TimedeltaIndex(other) return self, other + def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: + """ + Can we compare values of the given dtype to our own? + """ + return is_timedelta64_dtype(dtype) + def get_loc(self, key, method=None, tolerance=None): """ Get integer location for requested label diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 2981f56e58ecc..86657f89953ec 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -2616,3 +2616,40 @@ def test_convert_almost_null_slice(indices): msg = "'>=' not supported between instances of 'str' and 'int'" with pytest.raises(TypeError, match=msg): idx._convert_slice_indexer(key, "loc") + + +dtlike_dtypes = [ + np.dtype("timedelta64[ns]"), + np.dtype("datetime64[ns]"), + pd.DatetimeTZDtype("ns", "Asia/Tokyo"), + pd.PeriodDtype("ns"), +] + + +@pytest.mark.parametrize("ldtype", dtlike_dtypes) +@pytest.mark.parametrize("rdtype", dtlike_dtypes) +def test_get_indexer_non_unique_wrong_dtype(ldtype, rdtype): + + vals = np.tile(3600 * 10 ** 9 * np.arange(3), 2) + + def construct(dtype): + if dtype is dtlike_dtypes[-1]: + # PeriodArray will try to cast ints to strings + return pd.DatetimeIndex(vals).astype(dtype) + return pd.Index(vals, dtype=dtype) + + left = construct(ldtype) + right = construct(rdtype) + + result = left.get_indexer_non_unique(right) + + if ldtype is rdtype: + ex1 = np.array([0, 3, 1, 4, 2, 5] * 2, dtype=np.intp) + ex2 = np.array([], dtype=np.intp) + tm.assert_numpy_array_equal(result[0], ex1) + tm.assert_numpy_array_equal(result[1], ex2) + + else: + no_matches = np.array([-1] * 6, dtype=np.intp) + tm.assert_numpy_array_equal(result[0], no_matches) + tm.assert_numpy_array_equal(result[1], no_matches) From ef8a99863f34a4e87b62d09072b1471aa16f53f7 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 12 Mar 2020 08:35:59 -0700 Subject: [PATCH 2/5] mypy fixup --- pandas/core/indexes/period.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index ad65737f20bb6..a2a0052744524 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -20,10 +20,10 @@ is_float, is_integer, is_object_dtype, - is_period_dtype, is_scalar, pandas_dtype, ) +from pandas.core.dtypes.dtypes import PeriodDtype from pandas.core.arrays.period import ( PeriodArray, @@ -301,7 +301,7 @@ def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: """ Can we compare values of the given dtype to our own? """ - if not is_period_dtype(dtype): + if not isinstance(dtype, PeriodDtype): return False return dtype.freq == self.freq From 3ec183a151044c1f9dccb11e5914eea4d1fa88c1 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 12 Mar 2020 08:38:17 -0700 Subject: [PATCH 3/5] troubleshoot in int32 build --- pandas/tests/indexes/test_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 86657f89953ec..5bdbc18769ce5 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -2647,7 +2647,7 @@ def construct(dtype): ex1 = np.array([0, 3, 1, 4, 2, 5] * 2, dtype=np.intp) ex2 = np.array([], dtype=np.intp) tm.assert_numpy_array_equal(result[0], ex1) - tm.assert_numpy_array_equal(result[1], ex2) + tm.assert_numpy_array_equal(result[1], ex2.astype(np.int64)) else: no_matches = np.array([-1] * 6, dtype=np.intp) From 9e4597d9dbe861e4b41550d18bc0bafa0be797c4 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 13 Mar 2020 20:34:47 -0700 Subject: [PATCH 4/5] test+whatsnew --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/tests/indexing/test_loc.py | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 5b6f70be478c2..2ef49903702d0 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -266,6 +266,7 @@ Indexing - Bug in :meth:`DataFrame.iat` incorrectly returning ``Timestamp`` instead of ``datetime`` in some object-dtype cases (:issue:`32809`) - Bug in :meth:`Series.loc` and :meth:`DataFrame.loc` when indexing with an integer key on a object-dtype :class:`Index` that is not all-integers (:issue:`31905`) - Bug in :meth:`DataFrame.iloc.__setitem__` on a :class:`DataFrame` with duplicate columns incorrectly setting values for all matching columns (:issue:`15686`, :issue:`22036`) +- Bug in :meth:`DataFrame.loc:` and :meth:`Series.loc` with a :class:`DatetimeIndex`, :class:`TimedeltaIndex`, or :class:`PeriodIndex` incorrectly allowing lookups of non-matching datetime-like dtypes (:issue:`32650`) Missing ^^^^^^^ diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 3073fe085de15..277ed4f83c4df 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -1015,3 +1015,24 @@ def test_loc_slice_disallows_positional(): with tm.assert_produces_warning(FutureWarning): # GH#31840 deprecated incorrect behavior df.loc[1:3, 1] = 2 + + +def test_loc_datetimelike_mismatched_dtypes(): + # GH#32650 dont mix and match datetime/timedelta/period dtypes + + df = pd.DataFrame( + np.random.randn(5, 3), + columns=["a", "b", "c"], + index=pd.date_range("2012", freq="H", periods=5), + ) + # create dataframe with non-unique DatetimeIndex + df = df.iloc[[0, 2, 2, 3]].copy() + + dti = df.index + tdi = pd.TimedeltaIndex(dti.asi8) # matching i8 values + + with pytest.raises(TypeError, match="is not convertible to datetime"): + df.loc[tdi] + + with pytest.raises(TypeError, match="is not convertible to datetime"): + df["a"].loc[tdi] From 60bed7935dcad4e0a433647f36c6bd9d54c183ba Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 14 Mar 2020 10:50:25 -0700 Subject: [PATCH 5/5] TypeError->KeyError --- pandas/core/indexes/base.py | 3 +++ pandas/tests/indexing/test_loc.py | 5 +++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 86da8169bf286..51c9cd881de4e 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -531,6 +531,9 @@ def _shallow_copy_with_infer(self, values, **kwargs): return self._constructor(values, **attributes) except (TypeError, ValueError): pass + + # Remove tz so Index will try non-DatetimeIndex inference + attributes.pop("tz", None) return Index(values, **attributes) def _update_inplace(self, result, **kwargs): diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 277ed4f83c4df..3cad7874ea957 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -1031,8 +1031,9 @@ def test_loc_datetimelike_mismatched_dtypes(): dti = df.index tdi = pd.TimedeltaIndex(dti.asi8) # matching i8 values - with pytest.raises(TypeError, match="is not convertible to datetime"): + msg = r"None of \[TimedeltaIndex.* are in the \[index\]" + with pytest.raises(KeyError, match=msg): df.loc[tdi] - with pytest.raises(TypeError, match="is not convertible to datetime"): + with pytest.raises(KeyError, match=msg): df["a"].loc[tdi]