diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 9e557a0020f1e..9b9fbe86a6d22 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -201,7 +201,7 @@ Datetimelike - Bug in constructing a :class:`Series` or :class:`DataFrame` with a ``datetime`` object out of bounds for ``datetime64[ns]`` dtype or a ``timedelta`` object ouf of bounds for ``timedelta64[ns]`` dtype (:issue:`38792`, :issue:`38965`) - Bug in :meth:`DatetimeIndex.intersection`, :meth:`DatetimeIndex.symmetric_difference`, :meth:`PeriodIndex.intersection`, :meth:`PeriodIndex.symmetric_difference` always returning object-dtype when operating with :class:`CategoricalIndex` (:issue:`38741`) - Bug in :meth:`Series.where` incorrectly casting ``datetime64`` values to ``int64`` (:issue:`37682`) -- +- Bug in :class:`Categorical` incorrectly typecasting ``datetime`` object to ``Timestamp`` (:issue:`38878`) Timedelta ^^^^^^^^^ @@ -312,8 +312,8 @@ Reshaping - Bug in :func:`merge` raising error when performing an inner join with partial index and ``right_index`` when no overlap between indices (:issue:`33814`) - Bug in :meth:`DataFrame.unstack` with missing levels led to incorrect index names (:issue:`37510`) - Bug in :func:`join` over :class:`MultiIndex` returned wrong result, when one of both indexes had only one level (:issue:`36909`) -- Bug in :func:`concat` incorrectly casting to ``object`` dtype in some cases when one or more of the operands is empty (:issue:`38843`, :issue:`38907`) - :meth:`merge_asof` raises ``ValueError`` instead of cryptic ``TypeError`` in case of non-numerical merge columns (:issue:`29130`) +- Sparse ^^^^^^ diff --git a/pandas/conftest.py b/pandas/conftest.py index 9fc1f0509d232..b62cb52492b97 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -703,8 +703,8 @@ def float_frame(): # ---------------------------------------------------------------- @pytest.fixture( params=[ - (Interval(left=0, right=5), 
IntervalDtype("int64")), - (Interval(left=0.1, right=0.5), IntervalDtype("float64")), + (Interval(left=0, right=5), IntervalDtype("int64", "right")), + (Interval(left=0.1, right=0.5), IntervalDtype("float64", "right")), (Period("2012-01", freq="M"), "period[M]"), (Period("2012-02-01", freq="D"), "period[D]"), ( diff --git a/pandas/core/arrays/_arrow_utils.py b/pandas/core/arrays/_arrow_utils.py index 959a13d9c107d..31f6896b12f98 100644 --- a/pandas/core/arrays/_arrow_utils.py +++ b/pandas/core/arrays/_arrow_utils.py @@ -127,7 +127,7 @@ def __hash__(self): def to_pandas_dtype(self): import pandas as pd - return pd.IntervalDtype(self.subtype.to_pandas_dtype()) + return pd.IntervalDtype(self.subtype.to_pandas_dtype(), self.closed) # register the type with a dummy instance _interval_type = ArrowIntervalType(pyarrow.int64(), "left") diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 872f17b7f0770..284305b588e4e 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -149,7 +149,7 @@ >>> pd.arrays.IntervalArray([pd.Interval(0, 1), pd.Interval(1, 5)]) [(0, 1], (1, 5]] - Length: 2, closed: right, dtype: interval[int64] + Length: 2, closed: right, dtype: interval[int64, right] It may also be constructed using one of the constructor methods: :meth:`IntervalArray.from_arrays`, @@ -222,6 +222,9 @@ def _simple_new( ): result = IntervalMixin.__new__(cls) + if closed is None and isinstance(dtype, IntervalDtype): + closed = dtype.closed + closed = closed or "right" left = ensure_index(left, copy=copy) right = ensure_index(right, copy=copy) @@ -238,6 +241,12 @@ def _simple_new( msg = f"dtype must be an IntervalDtype, got {dtype}" raise TypeError(msg) + if dtype.closed is None: + # possibly loading an old pickle + dtype = IntervalDtype(dtype.subtype, closed) + elif closed != dtype.closed: + raise ValueError("closed keyword does not match dtype.closed") + # coerce dtypes to match if needed if is_float_dtype(left) and 
is_integer_dtype(right): right = right.astype(left.dtype) @@ -279,9 +288,11 @@ def _simple_new( # If these share data, then setitem could corrupt our IA right = right.copy() + dtype = IntervalDtype(left.dtype, closed=closed) + result._dtype = dtype + result._left = left result._right = right - result._closed = closed if verify_integrity: result._validate() return result @@ -343,7 +354,7 @@ def _from_factorized(cls, values, original): >>> pd.arrays.IntervalArray.from_breaks([0, 1, 2, 3]) [(0, 1], (1, 2], (2, 3]] - Length: 3, closed: right, dtype: interval[int64] + Length: 3, closed: right, dtype: interval[int64, right] """ ), } @@ -414,7 +425,7 @@ def from_breaks( >>> pd.arrays.IntervalArray.from_arrays([0, 1, 2], [1, 2, 3]) [(0, 1], (1, 2], (2, 3]] - Length: 3, closed: right, dtype: interval[int64] + Length: 3, closed: right, dtype: interval[int64, right] """ ), } @@ -473,7 +484,7 @@ def from_arrays( >>> pd.arrays.IntervalArray.from_tuples([(0, 1), (1, 2)]) [(0, 1], (1, 2]] - Length: 2, closed: right, dtype: interval[int64] + Length: 2, closed: right, dtype: interval[int64, right] """ ), } @@ -553,7 +564,7 @@ def _shallow_copy(self, left, right): @property def dtype(self): - return IntervalDtype(self.left.dtype) + return self._dtype @property def nbytes(self) -> int: @@ -1174,7 +1185,7 @@ def mid(self): >>> intervals [(0, 1], (1, 3], (2, 4]] - Length: 3, closed: right, dtype: interval[int64] + Length: 3, closed: right, dtype: interval[int64, right] """ ), } @@ -1203,7 +1214,7 @@ def closed(self): Whether the intervals are closed on the left-side, right-side, both or neither. 
""" - return self._closed + return self.dtype.closed _interval_shared_docs["set_closed"] = textwrap.dedent( """ @@ -1238,11 +1249,11 @@ def closed(self): >>> index [(0, 1], (1, 2], (2, 3]] - Length: 3, closed: right, dtype: interval[int64] + Length: 3, closed: right, dtype: interval[int64, right] >>> index.set_closed('both') [[0, 1], [1, 2], [2, 3]] - Length: 3, closed: both, dtype: interval[int64] + Length: 3, closed: both, dtype: interval[int64, both] """ ), } @@ -1301,7 +1312,7 @@ def __array__(self, dtype: Optional[NpDtype] = None) -> np.ndarray: left = self._left right = self._right mask = self.isna() - closed = self._closed + closed = self.closed result = np.empty(len(left), dtype=object) for i in range(len(left)): @@ -1441,7 +1452,7 @@ def repeat(self, repeats, axis=None): >>> intervals [(0, 1], (1, 3], (2, 4]] - Length: 3, closed: right, dtype: interval[int64] + Length: 3, closed: right, dtype: interval[int64, right] """ ), } diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 8065f85548f8c..b1b7c28c04ebd 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -779,7 +779,7 @@ def infer_dtype_from_scalar(val, pandas_dtype: bool = False) -> Tuple[DtypeObj, dtype = PeriodDtype(freq=val.freq) elif lib.is_interval(val): subtype = infer_dtype_from_scalar(val.left, pandas_dtype=True)[0] - dtype = IntervalDtype(subtype=subtype) + dtype = IntervalDtype(subtype=subtype, closed=val.closed) return dtype, val diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index f31bcd97eafcf..d75ae77373403 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -999,8 +999,8 @@ class IntervalDtype(PandasExtensionDtype): Examples -------- - >>> pd.IntervalDtype(subtype='int64') - interval[int64] + >>> pd.IntervalDtype(subtype='int64', closed='both') + interval[int64, both] """ name = "interval" @@ -1008,20 +1008,34 @@ class IntervalDtype(PandasExtensionDtype): str = "|O08" base = np.dtype("O") 
num = 103 - _metadata = ("subtype",) - _match = re.compile(r"(I|i)nterval\[(?P<subtype>.+)\]") + _metadata = ( + "subtype", + "closed", + ) + _match = re.compile( + r"(I|i)nterval\[(?P<subtype>[^,]+)(, (?P<closed>(right|left|both|neither)))?\]" + ) _cache: Dict[str_type, PandasExtensionDtype] = {} - def __new__(cls, subtype=None): + def __new__(cls, subtype=None, closed: Optional[str_type] = None): from pandas.core.dtypes.common import is_string_dtype, pandas_dtype + if closed is not None and closed not in {"right", "left", "both", "neither"}: + raise ValueError("closed must be one of 'right', 'left', 'both', 'neither'") + if isinstance(subtype, IntervalDtype): + if closed is not None and closed != subtype.closed: + raise ValueError( + "dtype.closed and 'closed' do not match. " + "Try IntervalDtype(dtype.subtype, closed) instead." + ) return subtype elif subtype is None: # we are called as an empty constructor # generally for pickle compat u = object.__new__(cls) u._subtype = None + u._closed = closed return u elif isinstance(subtype, str) and subtype.lower() == "interval": subtype = None @@ -1029,7 +1043,16 @@ def __new__(cls, subtype=None): if isinstance(subtype, str): m = cls._match.search(subtype) if m is not None: - subtype = m.group("subtype") + gd = m.groupdict() + subtype = gd["subtype"] + if gd.get("closed", None) is not None: + if closed is not None: + if closed != gd["closed"]: + raise ValueError( + "'closed' keyword does not match value " + "specified in dtype string" + ) + closed = gd["closed"] try: subtype = pandas_dtype(subtype) @@ -1044,14 +1067,20 @@ def __new__(cls, subtype=None): ) raise TypeError(msg) + key = str(subtype) + str(closed) try: - return cls._cache[str(subtype)] + return cls._cache[key] except KeyError: u = object.__new__(cls) u._subtype = subtype - cls._cache[str(subtype)] = u + u._closed = closed + cls._cache[key] = u return u + @property + def closed(self): + return self._closed + @property def subtype(self): """ @@ -1101,7 +1130,10 @@ def type(self): def 
__str__(self) -> str_type: if self.subtype is None: return "interval" - return f"interval[{self.subtype}]" + if self.closed is None: + # Only partially initialized GH#38394 + return f"interval[{self.subtype}]" + return f"interval[{self.subtype}, {self.closed}]" def __hash__(self) -> int: # make myself hashable @@ -1115,6 +1147,8 @@ def __eq__(self, other: Any) -> bool: elif self.subtype is None or other.subtype is None: # None should match any subtype return True + elif self.closed != other.closed: + return False else: from pandas.core.dtypes.common import is_dtype_equal @@ -1126,6 +1160,9 @@ def __setstate__(self, state): # pickle -> need to set the settable private ones here (see GH26067) self._subtype = state["subtype"] + # backward-compat older pickles won't have "closed" key + self._closed = state.pop("closed", None) + @classmethod def is_dtype(cls, dtype: object) -> bool: """ @@ -1174,9 +1211,13 @@ def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]: if not all(isinstance(x, IntervalDtype) for x in dtypes): return None + closed = cast("IntervalDtype", dtypes[0]).closed + if not all(cast("IntervalDtype", x).closed == closed for x in dtypes): + return np.dtype(object) + from pandas.core.dtypes.cast import find_common_type common = find_common_type([cast("IntervalDtype", x).subtype for x in dtypes]) if common == object: return np.dtype(object) - return IntervalDtype(common) + return IntervalDtype(common, closed=closed) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index aabc3e741641f..b6213e99513c1 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -158,7 +158,7 @@ def wrapped(self, other, sort=False): >>> pd.interval_range(start=0, end=5) IntervalIndex([(0, 1], (1, 2], (2, 3], (3, 4], (4, 5]], closed='right', - dtype='interval[int64]') + dtype='interval[int64, right]') It may also be constructed using one of the constructor methods: :meth:`IntervalIndex.from_arrays`, @@ 
-243,7 +243,7 @@ def _simple_new(cls, array: IntervalArray, name: Label = None): >>> pd.IntervalIndex.from_breaks([0, 1, 2, 3]) IntervalIndex([(0, 1], (1, 2], (2, 3]], closed='right', - dtype='interval[int64]') + dtype='interval[int64, right]') """ ), } @@ -274,7 +274,7 @@ def from_breaks( >>> pd.IntervalIndex.from_arrays([0, 1, 2], [1, 2, 3]) IntervalIndex([(0, 1], (1, 2], (2, 3]], closed='right', - dtype='interval[int64]') + dtype='interval[int64, right]') """ ), } @@ -306,7 +306,7 @@ def from_arrays( >>> pd.IntervalIndex.from_tuples([(0, 1), (1, 2)]) IntervalIndex([(0, 1], (1, 2]], closed='right', - dtype='interval[int64]') + dtype='interval[int64, right]') """ ), } @@ -446,7 +446,7 @@ def is_overlapping(self) -> bool: >>> index IntervalIndex([(0, 2], (1, 3], (4, 5]], closed='right', - dtype='interval[int64]') + dtype='interval[int64, right]') >>> index.is_overlapping True @@ -456,7 +456,7 @@ def is_overlapping(self) -> bool: >>> index IntervalIndex([[0, 1], [1, 2], [2, 3]], closed='both', - dtype='interval[int64]') + dtype='interval[int64, both]') >>> index.is_overlapping True @@ -466,7 +466,7 @@ def is_overlapping(self) -> bool: >>> index IntervalIndex([[0, 1), [1, 2), [2, 3)], closed='left', - dtype='interval[int64]') + dtype='interval[int64, left]') >>> index.is_overlapping False """ @@ -1117,7 +1117,8 @@ def interval_range( >>> pd.interval_range(start=0, end=5) IntervalIndex([(0, 1], (1, 2], (2, 3], (3, 4], (4, 5]], - closed='right', dtype='interval[int64]') + closed='right', + dtype='interval[int64, right]') Additionally, datetime-like input is also supported. @@ -1125,7 +1126,7 @@ def interval_range( ... end=pd.Timestamp('2017-01-04')) IntervalIndex([(2017-01-01, 2017-01-02], (2017-01-02, 2017-01-03], (2017-01-03, 2017-01-04]], - closed='right', dtype='interval[datetime64[ns]]') + closed='right', dtype='interval[datetime64[ns], right]') The ``freq`` parameter specifies the frequency between the left and right. 
endpoints of the individual intervals within the ``IntervalIndex``. For @@ -1133,7 +1134,7 @@ def interval_range( >>> pd.interval_range(start=0, periods=4, freq=1.5) IntervalIndex([(0.0, 1.5], (1.5, 3.0], (3.0, 4.5], (4.5, 6.0]], - closed='right', dtype='interval[float64]') + closed='right', dtype='interval[float64, right]') Similarly, for datetime-like ``start`` and ``end``, the frequency must be convertible to a DateOffset. @@ -1142,7 +1143,7 @@ def interval_range( ... periods=3, freq='MS') IntervalIndex([(2017-01-01, 2017-02-01], (2017-02-01, 2017-03-01], (2017-03-01, 2017-04-01]], - closed='right', dtype='interval[datetime64[ns]]') + closed='right', dtype='interval[datetime64[ns], right]') Specify ``start``, ``end``, and ``periods``; the frequency is generated automatically (linearly spaced). @@ -1150,14 +1151,14 @@ def interval_range( >>> pd.interval_range(start=0, end=6, periods=4) IntervalIndex([(0.0, 1.5], (1.5, 3.0], (3.0, 4.5], (4.5, 6.0]], closed='right', - dtype='interval[float64]') + dtype='interval[float64, right]') The ``closed`` parameter specifies which endpoints of the individual intervals within the ``IntervalIndex`` are closed. >>> pd.interval_range(end=5, periods=4, closed='both') IntervalIndex([[1, 2], [2, 3], [3, 4], [4, 5]], - closed='both', dtype='interval[int64]') + closed='both', dtype='interval[int64, both]') """ start = maybe_box_datetimelike(start) end = maybe_box_datetimelike(end) diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 4c5347bd16e8b..969b416669023 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -135,12 +135,12 @@ def cut( >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3) ... # doctest: +ELLIPSIS [(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], ... - Categories (3, interval[float64]): [(0.994, 3.0] < (3.0, 5.0] ... + Categories (3, interval[float64, right]): [(0.994, 3.0] < (3.0, 5.0] ... >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3, retbins=True) ... 
# doctest: +ELLIPSIS ([(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], ... - Categories (3, interval[float64]): [(0.994, 3.0] < (3.0, 5.0] ... + Categories (3, interval[float64, right]): [(0.994, 3.0] < (3.0, 5.0] ... array([0.994, 3. , 5. , 7. ])) Discovers the same bins, but assign them specific labels. Notice that @@ -176,7 +176,7 @@ def cut( d (7.333, 10.0] e (7.333, 10.0] dtype: category - Categories (3, interval[float64]): [(1.992, 4.667] < (4.667, ... + Categories (3, interval[float64, right]): [(1.992, 4.667] < (4.667, ... Passing a Series as an input returns a Series with mapping value. It is used to map numerically to intervals based on bins. @@ -214,7 +214,7 @@ def cut( >>> bins = pd.IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)]) >>> pd.cut([0, 0.5, 1.5, 2.5, 4.5], bins) [NaN, (0.0, 1.0], NaN, (2.0, 3.0], (4.0, 5.0]] - Categories (3, interval[int64]): [(0, 1] < (2, 3] < (4, 5]] + Categories (3, interval[int64, right]): [(0, 1] < (2, 3] < (4, 5]] """ # NOTE: this binning code is changed a bit from histogram for var(x) == 0 @@ -336,7 +336,7 @@ def qcut( >>> pd.qcut(range(5), 4) ... # doctest: +ELLIPSIS [(-0.001, 1.0], (-0.001, 1.0], (1.0, 2.0], (2.0, 3.0], (3.0, 4.0]] - Categories (4, interval[float64]): [(-0.001, 1.0] < (1.0, 2.0] ... + Categories (4, interval[float64, right]): [(-0.001, 1.0] < (1.0, 2.0] ... >>> pd.qcut(range(5), 3, labels=["good", "medium", "bad"]) ... 
# doctest: +SKIP diff --git a/pandas/tests/arithmetic/test_interval.py b/pandas/tests/arithmetic/test_interval.py index 6dc3b3b13dd0c..46db9100b8b93 100644 --- a/pandas/tests/arithmetic/test_interval.py +++ b/pandas/tests/arithmetic/test_interval.py @@ -133,7 +133,7 @@ def test_compare_scalar_na(self, op, array, nulls_fixture, request): result = op(array, nulls_fixture) expected = self.elementwise_comparison(op, array, nulls_fixture) - if nulls_fixture is pd.NA and array.dtype != pd.IntervalDtype("int64"): + if nulls_fixture is pd.NA and array.dtype.subtype != "int64": mark = pytest.mark.xfail( reason="broken for non-integer IntervalArray; see GH 31882" ) diff --git a/pandas/tests/arrays/interval/test_interval.py b/pandas/tests/arrays/interval/test_interval.py index e5ccb51ce36f5..af291ca98a91a 100644 --- a/pandas/tests/arrays/interval/test_interval.py +++ b/pandas/tests/arrays/interval/test_interval.py @@ -131,7 +131,7 @@ def test_repr(): expected = ( "\n" "[(0, 1], (1, 2]]\n" - "Length: 2, closed: right, dtype: interval[int64]" + "Length: 2, closed: right, dtype: interval[int64, right]" ) assert result == expected diff --git a/pandas/tests/dtypes/cast/test_find_common_type.py b/pandas/tests/dtypes/cast/test_find_common_type.py index 6043deec573f8..50f7c7c2e085a 100644 --- a/pandas/tests/dtypes/cast/test_find_common_type.py +++ b/pandas/tests/dtypes/cast/test_find_common_type.py @@ -130,12 +130,12 @@ def test_period_dtype_mismatch(dtype2): interval_dtypes = [ - IntervalDtype(np.int64), - IntervalDtype(np.float64), - IntervalDtype(np.uint64), - IntervalDtype(DatetimeTZDtype(unit="ns", tz="US/Eastern")), - IntervalDtype("M8[ns]"), - IntervalDtype("m8[ns]"), + IntervalDtype(np.int64, "right"), + IntervalDtype(np.float64, "right"), + IntervalDtype(np.uint64, "right"), + IntervalDtype(DatetimeTZDtype(unit="ns", tz="US/Eastern"), "right"), + IntervalDtype("M8[ns]", "right"), + IntervalDtype("m8[ns]", "right"), ] @@ -151,7 +151,7 @@ def test_interval_dtype(left, right): 
# i.e. numeric if right.subtype.kind in ["i", "u", "f"]: # both numeric -> common numeric subtype - expected = IntervalDtype(np.float64) + expected = IntervalDtype(np.float64, "right") assert result == expected else: assert result == object diff --git a/pandas/tests/dtypes/cast/test_infer_dtype.py b/pandas/tests/dtypes/cast/test_infer_dtype.py index eaf00718f4c91..a47c5555d3e9f 100644 --- a/pandas/tests/dtypes/cast/test_infer_dtype.py +++ b/pandas/tests/dtypes/cast/test_infer_dtype.py @@ -128,7 +128,7 @@ def test_infer_from_interval(left, right, subtype, closed, pandas_dtype): # GH 30337 interval = Interval(left, right, closed) result_dtype, result_value = infer_dtype_from_scalar(interval, pandas_dtype) - expected_dtype = f"interval[{subtype}]" if pandas_dtype else np.object_ + expected_dtype = f"interval[{subtype}, {closed}]" if pandas_dtype else np.object_ assert result_dtype == expected_dtype assert result_value == interval diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index 410731820dc73..c0a2a0c3a9897 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -533,11 +533,11 @@ def dtype(self): """ Class level fixture of dtype for TestIntervalDtype """ - return IntervalDtype("int64") + return IntervalDtype("int64", "right") def test_hash_vs_equality(self, dtype): # make sure that we satisfy is semantics - dtype2 = IntervalDtype("int64") + dtype2 = IntervalDtype("int64", "right") dtype3 = IntervalDtype(dtype2) assert dtype == dtype2 assert dtype2 == dtype @@ -565,10 +565,24 @@ def test_hash_vs_equality(self, dtype): "subtype", ["interval[int64]", "Interval[int64]", "int64", np.dtype("int64")] ) def test_construction(self, subtype): - i = IntervalDtype(subtype) + i = IntervalDtype(subtype, closed="right") assert i.subtype == np.dtype("int64") assert is_interval_dtype(i) + @pytest.mark.parametrize( + "subtype", ["interval[int64]", "Interval[int64]", "int64", np.dtype("int64")] + ) + def 
test_construction_allows_closed_none(self, subtype): + # GH#38394 + dtype = IntervalDtype(subtype) + + assert dtype.closed is None + + def test_closed_mismatch(self): + msg = "'closed' keyword does not match value specified in dtype string" + with pytest.raises(ValueError, match=msg): + IntervalDtype("interval[int64, left]", "right") + @pytest.mark.parametrize("subtype", [None, "interval", "Interval"]) def test_construction_generic(self, subtype): # generic @@ -603,10 +617,22 @@ def test_construction_errors(self, subtype): with pytest.raises(TypeError, match=msg): IntervalDtype(subtype) + def test_closed_must_match(self): + # GH#37933 + dtype = IntervalDtype(np.float64, "left") + + msg = "dtype.closed and 'closed' do not match" + with pytest.raises(ValueError, match=msg): + IntervalDtype(dtype, closed="both") + + def test_closed_invalid(self): + with pytest.raises(ValueError, match="closed must be one of"): + IntervalDtype(np.float64, "foo") + def test_construction_from_string(self, dtype): - result = IntervalDtype("interval[int64]") + result = IntervalDtype("interval[int64, right]") assert is_dtype_equal(dtype, result) - result = IntervalDtype.construct_from_string("interval[int64]") + result = IntervalDtype.construct_from_string("interval[int64, right]") assert is_dtype_equal(dtype, result) @pytest.mark.parametrize("string", [0, 3.14, ("a", "b"), None]) @@ -630,8 +656,8 @@ def test_construction_from_string_error_subtype(self, string): IntervalDtype.construct_from_string(string) def test_subclass(self): - a = IntervalDtype("interval[int64]") - b = IntervalDtype("interval[int64]") + a = IntervalDtype("interval[int64, right]") + b = IntervalDtype("interval[int64, right]") assert issubclass(type(a), type(a)) assert issubclass(type(a), type(b)) @@ -642,6 +668,9 @@ def test_is_dtype(self, dtype): assert IntervalDtype.is_dtype(IntervalDtype("float64")) assert IntervalDtype.is_dtype(IntervalDtype("int64")) assert IntervalDtype.is_dtype(IntervalDtype(np.int64)) + assert 
IntervalDtype.is_dtype(IntervalDtype("float64", "left")) + assert IntervalDtype.is_dtype(IntervalDtype("int64", "right")) + assert IntervalDtype.is_dtype(IntervalDtype(np.int64, "both")) assert not IntervalDtype.is_dtype("D") assert not IntervalDtype.is_dtype("3D") @@ -654,16 +683,29 @@ def test_is_dtype(self, dtype): assert not IntervalDtype.is_dtype(np.float64) def test_equality(self, dtype): - assert is_dtype_equal(dtype, "interval[int64]") - assert is_dtype_equal(dtype, IntervalDtype("int64")) - assert is_dtype_equal(IntervalDtype("int64"), IntervalDtype("int64")) + assert is_dtype_equal(dtype, "interval[int64, right]") + assert is_dtype_equal(dtype, IntervalDtype("int64", "right")) + assert is_dtype_equal( + IntervalDtype("int64", "right"), IntervalDtype("int64", "right") + ) + + assert not is_dtype_equal(dtype, "interval[int64]") + assert not is_dtype_equal(dtype, IntervalDtype("int64")) + assert not is_dtype_equal( + IntervalDtype("int64", "right"), IntervalDtype("int64") + ) assert not is_dtype_equal(dtype, "int64") - assert not is_dtype_equal(IntervalDtype("int64"), IntervalDtype("float64")) + assert not is_dtype_equal( + IntervalDtype("int64", "neither"), IntervalDtype("float64", "right") + ) + assert not is_dtype_equal( + IntervalDtype("int64", "both"), IntervalDtype("int64", "left") + ) # invalid subtype comparisons do not raise when directly compared - dtype1 = IntervalDtype("float64") - dtype2 = IntervalDtype("datetime64[ns, US/Eastern]") + dtype1 = IntervalDtype("float64", "left") + dtype2 = IntervalDtype("datetime64[ns, US/Eastern]", "left") assert dtype1 != dtype2 assert dtype2 != dtype1 @@ -684,7 +726,8 @@ def test_equality(self, dtype): ) def test_equality_generic(self, subtype): # GH 18980 - dtype = IntervalDtype(subtype) + closed = "right" if subtype is not None else None + dtype = IntervalDtype(subtype, closed=closed) assert is_dtype_equal(dtype, "interval") assert is_dtype_equal(dtype, IntervalDtype()) @@ -702,8 +745,9 @@ def 
test_equality_generic(self, subtype): ) def test_name_repr(self, subtype): # GH 18980 - dtype = IntervalDtype(subtype) - expected = f"interval[{subtype}]" + closed = "right" if subtype is not None else None + dtype = IntervalDtype(subtype, closed=closed) + expected = f"interval[{subtype}, {closed}]" assert str(dtype) == expected assert dtype.name == "interval" @@ -728,7 +772,7 @@ def test_basic(self, dtype): assert is_interval_dtype(s) def test_basic_dtype(self): - assert is_interval_dtype("interval[int64]") + assert is_interval_dtype("interval[int64, both]") assert is_interval_dtype(IntervalIndex.from_tuples([(0, 1)])) assert is_interval_dtype(IntervalIndex.from_breaks(np.arange(4))) assert is_interval_dtype( @@ -743,7 +787,7 @@ def test_basic_dtype(self): def test_caching(self): IntervalDtype.reset_cache() - dtype = IntervalDtype("int64") + dtype = IntervalDtype("int64", "right") assert len(IntervalDtype._cache) == 1 IntervalDtype("interval") @@ -757,6 +801,14 @@ def test_not_string(self): # GH30568: though IntervalDtype has object kind, it cannot be string assert not is_string_dtype(IntervalDtype()) + def test_unpickling_without_closed(self): + # GH#38394 + dtype = IntervalDtype("interval") + + assert dtype._closed is None + + tm.round_trip_pickle(dtype) + class TestCategoricalDtypeParametrized: @pytest.mark.parametrize( @@ -976,8 +1028,8 @@ def test_registry(dtype): [ ("int64", None), ("interval", IntervalDtype()), - ("interval[int64]", IntervalDtype()), - ("interval[datetime64[ns]]", IntervalDtype("datetime64[ns]")), + ("interval[int64, neither]", IntervalDtype()), + ("interval[datetime64[ns], left]", IntervalDtype("datetime64[ns]", "left")), ("period[D]", PeriodDtype("D")), ("category", CategoricalDtype()), ("datetime64[ns, US/Eastern]", DatetimeTZDtype("ns", "US/Eastern")), diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index a838b09b39be6..295c8a27e6ddd 100644 --- 
a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -182,7 +182,7 @@ def test_setitem_dict_preserves_dtypes(self): "obj,dtype", [ (Period("2020-01"), PeriodDtype("M")), - (Interval(left=0, right=5), IntervalDtype("int64")), + (Interval(left=0, right=5), IntervalDtype("int64", "right")), ( Timestamp("2011-01-01", tz="US/Eastern"), DatetimeTZDtype(tz="US/Eastern"), diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 889bd98d6d85a..6a0f86e133752 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -728,7 +728,7 @@ def test_constructor_dict_extension_scalar(self, ea_scalar_and_dtype): "data,dtype", [ (Period("2020-01"), PeriodDtype("M")), - (Interval(left=0, right=5), IntervalDtype("int64")), + (Interval(left=0, right=5), IntervalDtype("int64", "right")), ( Timestamp("2011-01-01", tz="US/Eastern"), DatetimeTZDtype(tz="US/Eastern"), diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index 216d37a381c32..3f04f0f1163e7 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -160,13 +160,17 @@ def test_transform_broadcast(tsframe, ts): def test_transform_axis_1(request, transformation_func): # GH 36308 + warn = None if transformation_func == "tshift": + warn = FutureWarning + request.node.add_marker(pytest.mark.xfail(reason="tshift is deprecated")) args = ("ffill",) if transformation_func == "fillna" else () df = DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]}, index=["x", "y"]) - result = df.groupby([0, 0, 1], axis=1).transform(transformation_func, *args) - expected = df.T.groupby([0, 0, 1]).transform(transformation_func, *args).T + with tm.assert_produces_warning(warn): + result = df.groupby([0, 0, 1], axis=1).transform(transformation_func, *args) + expected = df.T.groupby([0, 0, 
1]).transform(transformation_func, *args).T if transformation_func == "diff": # Result contains nans, so transpose coerces to float diff --git a/pandas/tests/indexes/interval/test_astype.py b/pandas/tests/indexes/interval/test_astype.py index 840e3f641e8ba..c269d6ff11896 100644 --- a/pandas/tests/indexes/interval/test_astype.py +++ b/pandas/tests/indexes/interval/test_astype.py @@ -90,7 +90,7 @@ def index(self, request): "subtype", ["float64", "datetime64[ns]", "timedelta64[ns]"] ) def test_subtype_conversion(self, index, subtype): - dtype = IntervalDtype(subtype) + dtype = IntervalDtype(subtype, index.closed) result = index.astype(dtype) expected = IntervalIndex.from_arrays( index.left.astype(subtype), index.right.astype(subtype), closed=index.closed @@ -102,7 +102,7 @@ def test_subtype_conversion(self, index, subtype): ) def test_subtype_integer(self, subtype_start, subtype_end): index = IntervalIndex.from_breaks(np.arange(100, dtype=subtype_start)) - dtype = IntervalDtype(subtype_end) + dtype = IntervalDtype(subtype_end, index.closed) result = index.astype(dtype) expected = IntervalIndex.from_arrays( index.left.astype(subtype_end), @@ -115,7 +115,7 @@ def test_subtype_integer(self, subtype_start, subtype_end): def test_subtype_integer_errors(self): # int64 -> uint64 fails with negative values index = interval_range(-10, 10) - dtype = IntervalDtype("uint64") + dtype = IntervalDtype("uint64", "right") # Until we decide what the exception message _should_ be, we # assert something that it should _not_ be. 
@@ -143,7 +143,7 @@ def index(self, request): @pytest.mark.parametrize("subtype", ["int64", "uint64"]) def test_subtype_integer(self, subtype): index = interval_range(0.0, 10.0) - dtype = IntervalDtype(subtype) + dtype = IntervalDtype(subtype, "right") result = index.astype(dtype) expected = IntervalIndex.from_arrays( index.left.astype(subtype), index.right.astype(subtype), closed=index.closed @@ -158,7 +158,7 @@ def test_subtype_integer(self, subtype): @pytest.mark.parametrize("subtype", ["int64", "uint64"]) def test_subtype_integer_with_non_integer_borders(self, subtype): index = interval_range(0.0, 3.0, freq=0.25) - dtype = IntervalDtype(subtype) + dtype = IntervalDtype(subtype, "right") result = index.astype(dtype) expected = IntervalIndex.from_arrays( index.left.astype(subtype), index.right.astype(subtype), closed=index.closed @@ -168,17 +168,17 @@ def test_subtype_integer_with_non_integer_borders(self, subtype): def test_subtype_integer_errors(self): # float64 -> uint64 fails with negative values index = interval_range(-10.0, 10.0) - dtype = IntervalDtype("uint64") + dtype = IntervalDtype("uint64", "right") msg = re.escape( - "Cannot convert interval[float64] to interval[uint64]; subtypes are " - "incompatible" + "Cannot convert interval[float64, right] to interval[uint64, right]; " + "subtypes are incompatible" ) with pytest.raises(TypeError, match=msg): index.astype(dtype) @pytest.mark.parametrize("subtype", ["datetime64[ns]", "timedelta64[ns]"]) def test_subtype_datetimelike(self, index, subtype): - dtype = IntervalDtype(subtype) + dtype = IntervalDtype(subtype, "right") msg = "Cannot convert .* to .*; subtypes are incompatible" with pytest.raises(TypeError, match=msg): index.astype(dtype) @@ -201,7 +201,7 @@ def index(self, request): @pytest.mark.parametrize("subtype", ["int64", "uint64"]) def test_subtype_integer(self, index, subtype): - dtype = IntervalDtype(subtype) + dtype = IntervalDtype(subtype, "right") with 
tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = index.astype(dtype) expected = IntervalIndex.from_arrays( @@ -212,14 +212,14 @@ def test_subtype_integer(self, index, subtype): tm.assert_index_equal(result, expected) def test_subtype_float(self, index): - dtype = IntervalDtype("float64") + dtype = IntervalDtype("float64", "right") msg = "Cannot convert .* to .*; subtypes are incompatible" with pytest.raises(TypeError, match=msg): index.astype(dtype) def test_subtype_datetimelike(self): # datetime -> timedelta raises - dtype = IntervalDtype("timedelta64[ns]") + dtype = IntervalDtype("timedelta64[ns]", "right") msg = "Cannot convert .* to .*; subtypes are incompatible" index = interval_range(Timestamp("2018-01-01"), periods=10) @@ -231,7 +231,7 @@ def test_subtype_datetimelike(self): index.astype(dtype) # timedelta -> datetime raises - dtype = IntervalDtype("datetime64[ns]") + dtype = IntervalDtype("datetime64[ns]", "right") index = interval_range(Timedelta("0 days"), periods=10) with pytest.raises(TypeError, match=msg): index.astype(dtype) diff --git a/pandas/tests/indexes/interval/test_constructors.py b/pandas/tests/indexes/interval/test_constructors.py index 409b9419cc464..e3b41e6c5d6bb 100644 --- a/pandas/tests/indexes/interval/test_constructors.py +++ b/pandas/tests/indexes/interval/test_constructors.py @@ -82,13 +82,41 @@ def test_constructor_dtype(self, constructor, breaks, subtype): expected = constructor(**expected_kwargs) result_kwargs = self.get_kwargs_from_breaks(breaks) - iv_dtype = IntervalDtype(subtype) + iv_dtype = IntervalDtype(subtype, "right") for dtype in (iv_dtype, str(iv_dtype)): with tm.assert_produces_warning(warn, check_stacklevel=False): result = constructor(dtype=dtype, **result_kwargs) tm.assert_index_equal(result, expected) + @pytest.mark.parametrize( + "breaks", + [ + Int64Index([0, 1, 2, 3, 4]), + Int64Index([0, 1, 2, 3, 4]), + Int64Index([0, 1, 2, 3, 4]), + Float64Index([0, 1, 2, 3, 4]), + 
date_range("2017-01-01", periods=5), + timedelta_range("1 day", periods=5), + ], + ) + def test_constructor_pass_closed(self, constructor, breaks): + # not passing closed to IntervalDtype, but to IntervalArray constructor + warn = None + if isinstance(constructor, partial) and constructor.func is Index: + # passing kwargs to Index is deprecated + warn = FutureWarning + + iv_dtype = IntervalDtype(breaks.dtype) + + result_kwargs = self.get_kwargs_from_breaks(breaks) + + for dtype in (iv_dtype, str(iv_dtype)): + with tm.assert_produces_warning(warn, check_stacklevel=False): + + result = constructor(dtype=dtype, closed="left", **result_kwargs) + assert result.dtype.closed == "left" + @pytest.mark.filterwarnings("ignore:Passing keywords other:FutureWarning") @pytest.mark.parametrize("breaks", [[np.nan] * 2, [np.nan] * 4, [np.nan] * 50]) def test_constructor_nan(self, constructor, breaks, closed): @@ -165,7 +193,7 @@ def test_generic_errors(self, constructor): filler = self.get_kwargs_from_breaks(range(10)) # invalid closed - msg = "invalid option for 'closed': invalid" + msg = "closed must be one of 'right', 'left', 'both', 'neither'" with pytest.raises(ValueError, match=msg): constructor(closed="invalid", **filler) @@ -439,3 +467,16 @@ def test_index_mixed_closed(self): result = Index(intervals) expected = Index(intervals, dtype=object) tm.assert_index_equal(result, expected) + + +def test_dtype_closed_mismatch(): + # GH#38394 closed specified in both dtype and IntervalIndex constructor + + dtype = IntervalDtype(np.int64, "left") + + msg = "closed keyword does not match dtype.closed" + with pytest.raises(ValueError, match=msg): + IntervalIndex([], dtype=dtype, closed="neither") + + with pytest.raises(ValueError, match=msg): + IntervalArray([], dtype=dtype, closed="neither") diff --git a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py index b8734ce8950f2..02ef3cb0e2afb 100644 --- 
a/pandas/tests/indexes/interval/test_interval.py +++ b/pandas/tests/indexes/interval/test_interval.py @@ -585,7 +585,7 @@ def test_comparison(self): msg = "|".join( [ "not supported between instances of 'int' and '.*.Interval'", - r"Invalid comparison between dtype=interval\[int64\] and ", + r"Invalid comparison between dtype=interval\[int64, right\] and ", ] ) with pytest.raises(TypeError, match=msg): @@ -694,13 +694,13 @@ def test_append(self, closed): ) tm.assert_index_equal(result, expected) - msg = "Intervals must all be closed on the same side" for other_closed in {"left", "right", "both", "neither"} - {closed}: index_other_closed = IntervalIndex.from_arrays( [0, 1], [1, 2], closed=other_closed ) - with pytest.raises(ValueError, match=msg): - index1.append(index_other_closed) + result = index1.append(index_other_closed) + expected = index1.astype(object).append(index_other_closed.astype(object)) + tm.assert_index_equal(result, expected) def test_is_non_overlapping_monotonic(self, closed): # Should be True in all cases diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index 920182a99e9ef..8c1a674b705d5 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -141,7 +141,7 @@ ( pd.arrays.IntervalArray([pd.Interval(0, 1), pd.Interval(1, 5)]), None, - pd.IntervalDtype("int64"), + pd.IntervalDtype("int64", "right"), {}, ), ] diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 558653775236f..9350a3becb3d9 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -1010,7 +1010,7 @@ def test_construction_interval(self, interval_constructor): # construction from interval & array of intervals intervals = interval_constructor.from_breaks(np.arange(3), closed="right") result = Series(intervals) - assert result.dtype == "interval[int64]" + 
assert result.dtype == "interval[int64, right]" tm.assert_index_equal(Index(result.values), Index(intervals)) @pytest.mark.parametrize( @@ -1021,7 +1021,7 @@ def test_constructor_infer_interval(self, data_constructor): data = [Interval(0, 1), Interval(0, 2), None] result = Series(data_constructor(data)) expected = Series(IntervalArray(data)) - assert result.dtype == "interval[float64]" + assert result.dtype == "interval[float64, right]" tm.assert_series_equal(result, expected) @pytest.mark.parametrize( diff --git a/pandas/tests/util/test_assert_frame_equal.py b/pandas/tests/util/test_assert_frame_equal.py index 8034ace479a62..bb721afda2b8b 100644 --- a/pandas/tests/util/test_assert_frame_equal.py +++ b/pandas/tests/util/test_assert_frame_equal.py @@ -254,7 +254,7 @@ def test_assert_frame_equal_interval_dtype_mismatch(): "Attributes of DataFrame\\.iloc\\[:, 0\\] " '\\(column name="a"\\) are different\n\n' 'Attribute "dtype" are different\n' - "\\[left\\]: interval\\[int64\\]\n" + "\\[left\\]: interval\\[int64, right\\]\n" "\\[right\\]: object" ) diff --git a/pandas/tests/util/test_assert_series_equal.py b/pandas/tests/util/test_assert_series_equal.py index 6bf461d849b5e..df1853ffd26ae 100644 --- a/pandas/tests/util/test_assert_series_equal.py +++ b/pandas/tests/util/test_assert_series_equal.py @@ -253,7 +253,7 @@ def test_assert_series_equal_interval_dtype_mismatch(): msg = """Attributes of Series are different Attribute "dtype" are different -\\[left\\]: interval\\[int64\\] +\\[left\\]: interval\\[int64, right\\] \\[right\\]: object""" tm.assert_series_equal(left, right, check_dtype=False)