Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: DTI/TDI/PI get_indexer_non_unique with incompatible dtype #32650

Merged
merged 10 commits into from
Mar 17, 2020
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -301,6 +301,7 @@ Indexing
- Bug in :meth:`DataFrame.iat` incorrectly returning ``Timestamp`` instead of ``datetime`` in some object-dtype cases (:issue:`32809`)
- Bug in :meth:`Series.loc` and :meth:`DataFrame.loc` when indexing with an integer key on a object-dtype :class:`Index` that is not all-integers (:issue:`31905`)
- Bug in :meth:`DataFrame.iloc.__setitem__` on a :class:`DataFrame` with duplicate columns incorrectly setting values for all matching columns (:issue:`15686`, :issue:`22036`)
- Bug in :meth:`DataFrame.loc:` and :meth:`Series.loc` with a :class:`DatetimeIndex`, :class:`TimedeltaIndex`, or :class:`PeriodIndex` incorrectly allowing lookups of non-matching datetime-like dtypes (:issue:`32650`)

Missing
^^^^^^^
Expand Down
8 changes: 4 additions & 4 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@
ensure_platform_int,
is_bool,
is_bool_dtype,
is_categorical,
is_categorical_dtype,
is_datetime64_any_dtype,
is_dtype_equal,
Expand Down Expand Up @@ -532,6 +531,9 @@ def _shallow_copy_with_infer(self, values, **kwargs):
return self._constructor(values, **attributes)
except (TypeError, ValueError):
pass

# Remove tz so Index will try non-DatetimeIndex inference
attributes.pop("tz", None)
return Index(values, **attributes)

def _update_inplace(self, result, **kwargs):
Expand Down Expand Up @@ -4657,10 +4659,8 @@ def get_indexer_non_unique(self, target):
if pself is not self or ptarget is not target:
return pself.get_indexer_non_unique(ptarget)

if is_categorical(target):
if is_categorical_dtype(target.dtype):
tgt_values = np.asarray(target)
elif self.is_all_dates and target.is_all_dates: # GH 30399
tgt_values = target.asi8
else:
tgt_values = target._get_engine_target()

Expand Down
26 changes: 24 additions & 2 deletions pandas/core/indexes/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,14 @@

from pandas._libs import NaT, iNaT, join as libjoin, lib
from pandas._libs.tslibs import timezones
from pandas._typing import Label
from pandas._typing import DtypeObj, Label
from pandas.compat.numpy import function as nv
from pandas.errors import AbstractMethodError
from pandas.util._decorators import Appender, cache_readonly

from pandas.core.dtypes.common import (
ensure_int64,
ensure_platform_int,
is_bool_dtype,
is_categorical_dtype,
is_dtype_equal,
Expand All @@ -32,7 +33,7 @@
from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin
from pandas.core.base import _shared_docs
import pandas.core.indexes.base as ibase
from pandas.core.indexes.base import Index, _index_shared_docs
from pandas.core.indexes.base import Index, _index_shared_docs, ensure_index
from pandas.core.indexes.extension import (
ExtensionIndex,
inherit_names,
Expand Down Expand Up @@ -101,6 +102,12 @@ class DatetimeIndexOpsMixin(ExtensionIndex):
def is_all_dates(self) -> bool:
return True

def _is_comparable_dtype(self, dtype: DtypeObj) -> bool:
"""
Can we compare values of the given dtype to our own?
"""
raise AbstractMethodError(self)

# ------------------------------------------------------------------------
# Abstract data attributes

Expand Down Expand Up @@ -426,6 +433,21 @@ def _partial_date_slice(
# try to find the dates
return (lhs_mask & rhs_mask).nonzero()[0]

@Appender(Index.get_indexer_non_unique.__doc__)
def get_indexer_non_unique(self, target):
target = ensure_index(target)
pself, ptarget = self._maybe_promote(target)
if pself is not self or ptarget is not target:
return pself.get_indexer_non_unique(ptarget)

if not self._is_comparable_dtype(target.dtype):
no_matches = -1 * np.ones(self.shape, dtype=np.intp)
return no_matches, no_matches

tgt_values = target.asi8
indexer, missing = self._engine.get_indexer_non_unique(tgt_values)
return ensure_platform_int(indexer), missing

# --------------------------------------------------------------------

__add__ = make_wrapped_arith_op("__add__")
Expand Down
24 changes: 22 additions & 2 deletions pandas/core/indexes/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,18 @@

from pandas._libs import NaT, Period, Timestamp, index as libindex, lib, tslib as libts
from pandas._libs.tslibs import fields, parsing, timezones
from pandas._typing import Label
from pandas._typing import DtypeObj, Label
from pandas.util._decorators import cache_readonly

from pandas.core.dtypes.common import _NS_DTYPE, is_float, is_integer, is_scalar
from pandas.core.dtypes.common import (
_NS_DTYPE,
is_datetime64_any_dtype,
is_datetime64_dtype,
is_datetime64tz_dtype,
is_float,
is_integer,
is_scalar,
)
from pandas.core.dtypes.missing import is_valid_nat_for_dtype

from pandas.core.arrays.datetimes import DatetimeArray, tz_to_dtype
Expand Down Expand Up @@ -298,6 +306,18 @@ def _convert_for_op(self, value):
return Timestamp(value).asm8
raise ValueError("Passed item and index have different timezone")

def _is_comparable_dtype(self, dtype: DtypeObj) -> bool:
"""
Can we compare values of the given dtype to our own?
"""
if not is_datetime64_any_dtype(dtype):
return False
if self.tz is not None:
# If we have tz, we can compare to tzaware
return is_datetime64tz_dtype(dtype)
# if we dont have tz, we can only compare to tznaive
return is_datetime64_dtype(dtype)

# --------------------------------------------------------------------
# Rendering Methods

Expand Down
20 changes: 14 additions & 6 deletions pandas/core/indexes/period.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from pandas._libs.tslibs import frequencies as libfrequencies, resolution
from pandas._libs.tslibs.parsing import parse_time_string
from pandas._libs.tslibs.period import Period
from pandas._typing import Label
from pandas._typing import DtypeObj, Label
from pandas.util._decorators import Appender, cache_readonly

from pandas.core.dtypes.common import (
Expand All @@ -23,6 +23,7 @@
is_scalar,
pandas_dtype,
)
from pandas.core.dtypes.dtypes import PeriodDtype

from pandas.core.arrays.period import (
PeriodArray,
Expand Down Expand Up @@ -298,6 +299,14 @@ def _maybe_convert_timedelta(self, other):
# raise when input doesn't have freq
raise raise_on_incompatible(self, None)

def _is_comparable_dtype(self, dtype: DtypeObj) -> bool:
"""
Can we compare values of the given dtype to our own?
"""
if not isinstance(dtype, PeriodDtype):
return False
return dtype.freq == self.freq

# ------------------------------------------------------------------------
# Rendering Methods

Expand Down Expand Up @@ -454,12 +463,11 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
def get_indexer_non_unique(self, target):
target = ensure_index(target)

if isinstance(target, PeriodIndex):
if target.freq != self.freq:
no_matches = -1 * np.ones(self.shape, dtype=np.intp)
return no_matches, no_matches
if not self._is_comparable_dtype(target.dtype):
no_matches = -1 * np.ones(self.shape, dtype=np.intp)
return no_matches, no_matches

target = target.asi8
target = target.asi8

indexer, missing = self._int64index.get_indexer_non_unique(target)
return ensure_platform_int(indexer), missing
Expand Down
8 changes: 7 additions & 1 deletion pandas/core/indexes/timedeltas.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
""" implement the TimedeltaIndex """

from pandas._libs import NaT, Timedelta, index as libindex
from pandas._typing import Label
from pandas._typing import DtypeObj, Label
from pandas.util._decorators import Appender

from pandas.core.dtypes.common import (
Expand Down Expand Up @@ -213,6 +213,12 @@ def _maybe_promote(self, other):
other = TimedeltaIndex(other)
return self, other

def _is_comparable_dtype(self, dtype: DtypeObj) -> bool:
"""
Can we compare values of the given dtype to our own?
"""
return is_timedelta64_dtype(dtype)

def get_loc(self, key, method=None, tolerance=None):
"""
Get integer location for requested label
Expand Down
37 changes: 37 additions & 0 deletions pandas/tests/indexes/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2616,3 +2616,40 @@ def test_convert_almost_null_slice(indices):
msg = "'>=' not supported between instances of 'str' and 'int'"
with pytest.raises(TypeError, match=msg):
idx._convert_slice_indexer(key, "loc")


dtlike_dtypes = [
np.dtype("timedelta64[ns]"),
np.dtype("datetime64[ns]"),
pd.DatetimeTZDtype("ns", "Asia/Tokyo"),
pd.PeriodDtype("ns"),
]


@pytest.mark.parametrize("ldtype", dtlike_dtypes)
@pytest.mark.parametrize("rdtype", dtlike_dtypes)
def test_get_indexer_non_unique_wrong_dtype(ldtype, rdtype):

vals = np.tile(3600 * 10 ** 9 * np.arange(3), 2)

def construct(dtype):
if dtype is dtlike_dtypes[-1]:
# PeriodArray will try to cast ints to strings
return pd.DatetimeIndex(vals).astype(dtype)
return pd.Index(vals, dtype=dtype)

left = construct(ldtype)
right = construct(rdtype)

result = left.get_indexer_non_unique(right)

if ldtype is rdtype:
ex1 = np.array([0, 3, 1, 4, 2, 5] * 2, dtype=np.intp)
ex2 = np.array([], dtype=np.intp)
tm.assert_numpy_array_equal(result[0], ex1)
tm.assert_numpy_array_equal(result[1], ex2.astype(np.int64))

else:
no_matches = np.array([-1] * 6, dtype=np.intp)
tm.assert_numpy_array_equal(result[0], no_matches)
tm.assert_numpy_array_equal(result[1], no_matches)
22 changes: 22 additions & 0 deletions pandas/tests/indexing/test_loc.py
Original file line number Diff line number Diff line change
Expand Up @@ -1073,3 +1073,25 @@ def test_loc_slice_disallows_positional():
with tm.assert_produces_warning(FutureWarning):
# GH#31840 deprecated incorrect behavior
df.loc[1:3, 1] = 2


def test_loc_datetimelike_mismatched_dtypes():
# GH#32650 dont mix and match datetime/timedelta/period dtypes

df = pd.DataFrame(
np.random.randn(5, 3),
columns=["a", "b", "c"],
index=pd.date_range("2012", freq="H", periods=5),
)
# create dataframe with non-unique DatetimeIndex
df = df.iloc[[0, 2, 2, 3]].copy()

dti = df.index
tdi = pd.TimedeltaIndex(dti.asi8) # matching i8 values

msg = r"None of \[TimedeltaIndex.* are in the \[index\]"
with pytest.raises(KeyError, match=msg):
df.loc[tdi]

with pytest.raises(KeyError, match=msg):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i would test the reverse as well (e.g. TDI is the index with a DTI as indexer). This should also raise for Period, I think? (these can be handled as followons)

df["a"].loc[tdi]