diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 1663d4c44c362..cdef6c65863fa 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -301,6 +301,7 @@ Indexing - Bug in :meth:`DataFrame.iat` incorrectly returning ``Timestamp`` instead of ``datetime`` in some object-dtype cases (:issue:`32809`) - Bug in :meth:`Series.loc` and :meth:`DataFrame.loc` when indexing with an integer key on a object-dtype :class:`Index` that is not all-integers (:issue:`31905`) - Bug in :meth:`DataFrame.iloc.__setitem__` on a :class:`DataFrame` with duplicate columns incorrectly setting values for all matching columns (:issue:`15686`, :issue:`22036`) +- Bug in :meth:`DataFrame.loc:` and :meth:`Series.loc` with a :class:`DatetimeIndex`, :class:`TimedeltaIndex`, or :class:`PeriodIndex` incorrectly allowing lookups of non-matching datetime-like dtypes (:issue:`32650`) Missing ^^^^^^^ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 31966489403f4..51c9cd881de4e 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -29,7 +29,6 @@ ensure_platform_int, is_bool, is_bool_dtype, - is_categorical, is_categorical_dtype, is_datetime64_any_dtype, is_dtype_equal, @@ -532,6 +531,9 @@ def _shallow_copy_with_infer(self, values, **kwargs): return self._constructor(values, **attributes) except (TypeError, ValueError): pass + + # Remove tz so Index will try non-DatetimeIndex inference + attributes.pop("tz", None) return Index(values, **attributes) def _update_inplace(self, result, **kwargs): @@ -4657,10 +4659,8 @@ def get_indexer_non_unique(self, target): if pself is not self or ptarget is not target: return pself.get_indexer_non_unique(ptarget) - if is_categorical(target): + if is_categorical_dtype(target.dtype): tgt_values = np.asarray(target) - elif self.is_all_dates and target.is_all_dates: # GH 30399 - tgt_values = target.asi8 else: tgt_values = target._get_engine_target() diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 054a64bf3f990..878c588c6e51a 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -8,13 +8,14 @@ from pandas._libs import NaT, iNaT, join as libjoin, lib from pandas._libs.tslibs import timezones -from pandas._typing import Label +from pandas._typing import DtypeObj, Label from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError from pandas.util._decorators import Appender, cache_readonly from pandas.core.dtypes.common import ( ensure_int64, + ensure_platform_int, is_bool_dtype, is_categorical_dtype, is_dtype_equal, @@ -32,7 +33,7 @@ from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin from pandas.core.base import _shared_docs import pandas.core.indexes.base as ibase -from pandas.core.indexes.base import Index, _index_shared_docs +from pandas.core.indexes.base import Index, _index_shared_docs, ensure_index from pandas.core.indexes.extension import ( ExtensionIndex, inherit_names, @@ -101,6 +102,12 @@ class DatetimeIndexOpsMixin(ExtensionIndex): def is_all_dates(self) -> bool: return True + def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: + """ + Can we compare values of the given dtype to our own? + """ + raise AbstractMethodError(self) + # ------------------------------------------------------------------------ # Abstract data attributes @@ -426,6 +433,21 @@ def _partial_date_slice( # try to find the dates return (lhs_mask & rhs_mask).nonzero()[0] + @Appender(Index.get_indexer_non_unique.__doc__) + def get_indexer_non_unique(self, target): + target = ensure_index(target) + pself, ptarget = self._maybe_promote(target) + if pself is not self or ptarget is not target: + return pself.get_indexer_non_unique(ptarget) + + if not self._is_comparable_dtype(target.dtype): + no_matches = -1 * np.ones(self.shape, dtype=np.intp) + return no_matches, no_matches + + tgt_values = target.asi8 + indexer, missing = self._engine.get_indexer_non_unique(tgt_values) + return ensure_platform_int(indexer), missing + # -------------------------------------------------------------------- __add__ = make_wrapped_arith_op("__add__") diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index e791133220dbf..ca1995adc1ea9 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -7,10 +7,18 @@ from pandas._libs import NaT, Period, Timestamp, index as libindex, lib, tslib as libts from pandas._libs.tslibs import fields, parsing, timezones -from pandas._typing import Label +from pandas._typing import DtypeObj, Label from pandas.util._decorators import cache_readonly -from pandas.core.dtypes.common import _NS_DTYPE, is_float, is_integer, is_scalar +from pandas.core.dtypes.common import ( + _NS_DTYPE, + is_datetime64_any_dtype, + is_datetime64_dtype, + is_datetime64tz_dtype, + is_float, + is_integer, + is_scalar, +) from pandas.core.dtypes.missing import is_valid_nat_for_dtype from pandas.core.arrays.datetimes import DatetimeArray, tz_to_dtype @@ -298,6 +306,18 @@ def _convert_for_op(self, value): return Timestamp(value).asm8 raise ValueError("Passed item and index have different timezone") + def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: + """ + Can we compare values of the given dtype to our own? + """ + if not is_datetime64_any_dtype(dtype): + return False + if self.tz is not None: + # If we have tz, we can compare to tzaware + return is_datetime64tz_dtype(dtype) + # if we dont have tz, we can only compare to tznaive + return is_datetime64_dtype(dtype) + # -------------------------------------------------------------------- # Rendering Methods diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index f83234f1aac0b..f6bf02b6df676 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -9,7 +9,7 @@ from pandas._libs.tslibs import frequencies as libfrequencies, resolution from pandas._libs.tslibs.parsing import parse_time_string from pandas._libs.tslibs.period import Period -from pandas._typing import Label +from pandas._typing import DtypeObj, Label from pandas.util._decorators import Appender, cache_readonly from pandas.core.dtypes.common import ( @@ -23,6 +23,7 @@ is_scalar, pandas_dtype, ) +from pandas.core.dtypes.dtypes import PeriodDtype from pandas.core.arrays.period import ( PeriodArray, @@ -298,6 +299,14 @@ def _maybe_convert_timedelta(self, other): # raise when input doesn't have freq raise raise_on_incompatible(self, None) + def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: + """ + Can we compare values of the given dtype to our own? + """ + if not isinstance(dtype, PeriodDtype): + return False + return dtype.freq == self.freq + # ------------------------------------------------------------------------ # Rendering Methods @@ -454,12 +463,11 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): def get_indexer_non_unique(self, target): target = ensure_index(target) - if isinstance(target, PeriodIndex): - if target.freq != self.freq: - no_matches = -1 * np.ones(self.shape, dtype=np.intp) - return no_matches, no_matches + if not self._is_comparable_dtype(target.dtype): + no_matches = -1 * np.ones(self.shape, dtype=np.intp) + return no_matches, no_matches - target = target.asi8 + target = target.asi8 indexer, missing = self._int64index.get_indexer_non_unique(target) return ensure_platform_int(indexer), missing diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 7a7670b0e7965..588cb3e37bced 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -1,7 +1,7 @@ """ implement the TimedeltaIndex """ from pandas._libs import NaT, Timedelta, index as libindex -from pandas._typing import Label +from pandas._typing import DtypeObj, Label from pandas.util._decorators import Appender from pandas.core.dtypes.common import ( @@ -213,6 +213,12 @@ def _maybe_promote(self, other): other = TimedeltaIndex(other) return self, other + def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: + """ + Can we compare values of the given dtype to our own? + """ + return is_timedelta64_dtype(dtype) + def get_loc(self, key, method=None, tolerance=None): """ Get integer location for requested label diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 2981f56e58ecc..5bdbc18769ce5 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -2616,3 +2616,40 @@ def test_convert_almost_null_slice(indices): msg = "'>=' not supported between instances of 'str' and 'int'" with pytest.raises(TypeError, match=msg): idx._convert_slice_indexer(key, "loc") + + +dtlike_dtypes = [ + np.dtype("timedelta64[ns]"), + np.dtype("datetime64[ns]"), + pd.DatetimeTZDtype("ns", "Asia/Tokyo"), + pd.PeriodDtype("ns"), +] + + +@pytest.mark.parametrize("ldtype", dtlike_dtypes) +@pytest.mark.parametrize("rdtype", dtlike_dtypes) +def test_get_indexer_non_unique_wrong_dtype(ldtype, rdtype): + + vals = np.tile(3600 * 10 ** 9 * np.arange(3), 2) + + def construct(dtype): + if dtype is dtlike_dtypes[-1]: + # PeriodArray will try to cast ints to strings + return pd.DatetimeIndex(vals).astype(dtype) + return pd.Index(vals, dtype=dtype) + + left = construct(ldtype) + right = construct(rdtype) + + result = left.get_indexer_non_unique(right) + + if ldtype is rdtype: + ex1 = np.array([0, 3, 1, 4, 2, 5] * 2, dtype=np.intp) + ex2 = np.array([], dtype=np.intp) + tm.assert_numpy_array_equal(result[0], ex1) + tm.assert_numpy_array_equal(result[1], ex2.astype(np.int64)) + + else: + no_matches = np.array([-1] * 6, dtype=np.intp) + tm.assert_numpy_array_equal(result[0], no_matches) + tm.assert_numpy_array_equal(result[1], no_matches) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index a0edf78f79a53..ee92e5a69204d 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -1073,3 +1073,25 @@ def test_loc_slice_disallows_positional(): with tm.assert_produces_warning(FutureWarning): # GH#31840 deprecated incorrect behavior df.loc[1:3, 1] = 2 + + +def test_loc_datetimelike_mismatched_dtypes(): + # GH#32650 dont mix and match datetime/timedelta/period dtypes + + df = pd.DataFrame( + np.random.randn(5, 3), + columns=["a", "b", "c"], + index=pd.date_range("2012", freq="H", periods=5), + ) + # create dataframe with non-unique DatetimeIndex + df = df.iloc[[0, 2, 2, 3]].copy() + + dti = df.index + tdi = pd.TimedeltaIndex(dti.asi8) # matching i8 values + + msg = r"None of \[TimedeltaIndex.* are in the \[index\]" + with pytest.raises(KeyError, match=msg): + df.loc[tdi] + + with pytest.raises(KeyError, match=msg): + df["a"].loc[tdi]