BUG: Assorted DatetimeIndex bugfixes (#24157)

pandas-dev · Dec 14, 2018 · a1cf3f6 · a1cf3f6
1 parent 5cedb39
commit a1cf3f6
Show file tree

Hide file tree

Showing 10 changed files with 109 additions and 29 deletions.
diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst
@@ -380,6 +380,7 @@ Backwards incompatible API changes
 - ``max_rows`` and ``max_cols`` parameters removed from :class:`HTMLFormatter` since truncation is handled by :class:`DataFrameFormatter` (:issue:`23818`)
 - :meth:`read_csv` will now raise a ``ValueError`` if a column with missing values is declared as having dtype ``bool`` (:issue:`20591`)
 - The column order of the resultant :class:`DataFrame` from :meth:`MultiIndex.to_frame` is now guaranteed to match the :attr:`MultiIndex.names` order. (:issue:`22420`)
+- :func:`pd.offsets.generate_range` argument ``time_rule`` has been removed; use ``offset`` instead (:issue:`24157`)
 
 Percentage change on groupby changes
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -1133,7 +1134,6 @@ Deprecations
 - In :meth:`Series.where` with Categorical data, providing an ``other`` that is not present in the categories is deprecated. Convert the categorical to a different dtype or add the ``other`` to the categories first (:issue:`24077`).
 - :meth:`Series.clip_lower`, :meth:`Series.clip_upper`, :meth:`DataFrame.clip_lower` and :meth:`DataFrame.clip_upper` are deprecated and will be removed in a future version. Use ``Series.clip(lower=threshold)``, ``Series.clip(upper=threshold)`` and the equivalent ``DataFrame`` methods (:issue:`24203`)
 
-
 .. _whatsnew_0240.deprecations.datetimelike_int_ops:
 
 Integer Addition/Subtraction with Datetime-like Classes Is Deprecated
@@ -1310,6 +1310,9 @@ Datetimelike
 - Bug in :class:`Index` where calling ``np.array(dtindex, dtype=object)`` on a timezone-naive :class:`DatetimeIndex` would return an array of ``datetime`` objects instead of :class:`Timestamp` objects, potentially losing nanosecond portions of the timestamps (:issue:`23524`)
 - Bug in :class:`Categorical.__setitem__` not allowing setting with another ``Categorical`` when both are undordered and have the same categories, but in a different order (:issue:`24142`)
 - Bug in :func:`date_range` where using dates with millisecond resolution or higher could return incorrect values or the wrong number of values in the index (:issue:`24110`)
+- Bug in :class:`DatetimeIndex` where constructing a :class:`DatetimeIndex` from a :class:`Categorical` or :class:`CategoricalIndex` would incorrectly drop timezone information (:issue:`18664`)
+- Bug in :class:`DatetimeIndex` and :class:`TimedeltaIndex` where indexing with ``Ellipsis`` would incorrectly lose the index's ``freq`` attribute (:issue:`21282`)
+- Clarified error message produced when passing an incorrect ``freq`` argument to :class:`DatetimeIndex` with ``NaT`` as the first entry in the passed data (:issue:`11587`)
 
 Timedelta
 ^^^^^^^^^
@@ -1422,6 +1425,7 @@ Indexing
 - Bug in :func:`Index.union` and :func:`Index.intersection` where name of the ``Index`` of the result was not computed correctly for certain cases (:issue:`9943`, :issue:`9862`)
 - Bug in :class:`Index` slicing with boolean :class:`Index` may raise ``TypeError`` (:issue:`22533`)
 - Bug in ``PeriodArray.__setitem__`` when accepting slice and list-like value (:issue:`23978`)
+- Bug in :class:`DatetimeIndex`, :class:`TimedeltaIndex` where indexing with ``Ellipsis`` would lose their ``freq`` attribute (:issue:`21282`)
 
 Missing
 ^^^^^^^

diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
@@ -354,6 +354,10 @@ def __getitem__(self, key):
                     freq = key.step * self.freq
                 else:
                     freq = self.freq
+            elif key is Ellipsis:
+                # GH#21282 indexing with Ellipsis is similar to a full slice,
+                #  should preserve `freq` attribute
+                freq = self.freq
 
         attribs['freq'] = freq
 
@@ -550,9 +554,22 @@ def _validate_frequency(cls, index, freq, **kwargs):
         if index.size == 0 or inferred == freq.freqstr:
             return None
 
-        on_freq = cls._generate_range(start=index[0], end=None,
-                                      periods=len(index), freq=freq, **kwargs)
-        if not np.array_equal(index.asi8, on_freq.asi8):
+        try:
+            on_freq = cls._generate_range(start=index[0], end=None,
+                                          periods=len(index), freq=freq,
+                                          **kwargs)
+            if not np.array_equal(index.asi8, on_freq.asi8):
+                raise ValueError
+        except ValueError as e:
+            if "non-fixed" in str(e):
+                # non-fixed frequencies are not meaningful for timedelta64;
+                #  we retain that error message
+                raise e
+            # GH#11587 the main way this is reached is if the `np.array_equal`
+            #  check above is False.  This can also be reached if index[0]
+            #  is `NaT`, in which case the call to `cls._generate_range` will
+            #  raise a ValueError, which we re-raise with a more targeted
+            #  message.
             raise ValueError('Inferred frequency {infer} from passed values '
                              'does not conform to passed frequency {passed}'
                              .format(infer=inferred, passed=freq.freqstr))

diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py
@@ -14,9 +14,9 @@
 from pandas.util._decorators import Appender
 
 from pandas.core.dtypes.common import (
-    _INT64_DTYPE, _NS_DTYPE, is_datetime64_dtype, is_datetime64tz_dtype,
-    is_extension_type, is_float_dtype, is_int64_dtype, is_object_dtype,
-    is_period_dtype, is_string_dtype, is_timedelta64_dtype)
+    _INT64_DTYPE, _NS_DTYPE, is_categorical_dtype, is_datetime64_dtype,
+    is_datetime64tz_dtype, is_extension_type, is_float_dtype, is_int64_dtype,
+    is_object_dtype, is_period_dtype, is_string_dtype, is_timedelta64_dtype)
 from pandas.core.dtypes.dtypes import DatetimeTZDtype
 from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries
 from pandas.core.dtypes.missing import isna
@@ -277,6 +277,8 @@ def _generate_range(cls, start, end, periods, freq, tz=None,
             if closed is not None:
                 raise ValueError("Closed has to be None if not both of start"
                                  "and end are defined")
+        if start is NaT or end is NaT:
+            raise ValueError("Neither `start` nor `end` can be NaT")
 
         left_closed, right_closed = dtl.validate_endpoints(closed)
 
@@ -1666,6 +1668,13 @@ def maybe_convert_dtype(data, copy):
         raise TypeError("Passing PeriodDtype data is invalid.  "
                         "Use `data.to_timestamp()` instead")
 
+    elif is_categorical_dtype(data):
+        # GH#18664 preserve tz in going DTI->Categorical->DTI
+        # TODO: cases where we need to do another pass through this func,
+        #  e.g. the categories are timedelta64s
+        data = data.categories.take(data.codes, fill_value=NaT)
+        copy = False
+
     elif is_extension_type(data) and not is_datetime64tz_dtype(data):
         # Includes categorical
         # TODO: We have no tests for these

diff --git a/pandas/tests/indexes/datetimes/test_construction.py b/pandas/tests/indexes/datetimes/test_construction.py
@@ -14,12 +14,42 @@
 from pandas import (
     DatetimeIndex, Index, Timestamp, date_range, datetime, offsets,
     to_datetime)
-from pandas.core.arrays import period_array
+from pandas.core.arrays import (
+    DatetimeArrayMixin as DatetimeArray, period_array)
 import pandas.util.testing as tm
 
 
 class TestDatetimeIndex(object):
 
+    @pytest.mark.parametrize('dt_cls', [DatetimeIndex, DatetimeArray])
+    def test_freq_validation_with_nat(self, dt_cls):
+        # GH#11587 make sure we get a useful error message when generate_range
+        #  raises
+        msg = ("Inferred frequency None from passed values does not conform "
+               "to passed frequency D")
+        with pytest.raises(ValueError, match=msg):
+            dt_cls([pd.NaT, pd.Timestamp('2011-01-01')], freq='D')
+        with pytest.raises(ValueError, match=msg):
+            dt_cls([pd.NaT, pd.Timestamp('2011-01-01').value],
+                   freq='D')
+
+    def test_categorical_preserves_tz(self):
+        # GH#18664 retain tz when going DTI-->Categorical-->DTI
+        # TODO: parametrize over DatetimeIndex/DatetimeArray
+        #  once CategoricalIndex(DTA) works
+
+        dti = pd.DatetimeIndex(
+            [pd.NaT, '2015-01-01', '1999-04-06 15:14:13', '2015-01-01'],
+            tz='US/Eastern')
+
+        ci = pd.CategoricalIndex(dti)
+        carr = pd.Categorical(dti)
+        cser = pd.Series(ci)
+
+        for obj in [ci, carr, cser]:
+            result = pd.DatetimeIndex(obj)
+            tm.assert_index_equal(result, dti)
+
     def test_dti_with_period_data_raises(self):
         # GH#23675
         data = pd.PeriodIndex(['2016Q1', '2016Q2'], freq='Q')

diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py
@@ -80,6 +80,14 @@ def test_date_range_timestamp_equiv_preserve_frequency(self):
 
 
 class TestDateRanges(TestData):
+    def test_date_range_nat(self):
+        # GH#11587
+        msg = "Neither `start` nor `end` can be NaT"
+        with pytest.raises(ValueError, match=msg):
+            date_range(start='2016-01-01', end=pd.NaT, freq='D')
+        with pytest.raises(ValueError, match=msg):
+            date_range(start=pd.NaT, end='2016-01-01', freq='D')
+
     def test_date_range_out_of_bounds(self):
         # GH#14187
         with pytest.raises(OutOfBoundsDatetime):
@@ -533,12 +541,12 @@ class TestGenRangeGeneration(object):
 
     def test_generate(self):
         rng1 = list(generate_range(START, END, offset=BDay()))
-        rng2 = list(generate_range(START, END, time_rule='B'))
+        rng2 = list(generate_range(START, END, offset='B'))
         assert rng1 == rng2
 
     def test_generate_cday(self):
         rng1 = list(generate_range(START, END, offset=CDay()))
-        rng2 = list(generate_range(START, END, time_rule='C'))
+        rng2 = list(generate_range(START, END, offset='C'))
         assert rng1 == rng2
 
     def test_1(self):

diff --git a/pandas/tests/indexes/datetimes/test_indexing.py b/pandas/tests/indexes/datetimes/test_indexing.py
@@ -16,6 +16,15 @@
 
 
 class TestGetItem(object):
+    def test_ellipsis(self):
+        # GH#21282
+        idx = pd.date_range('2011-01-01', '2011-01-31', freq='D',
+                            tz='Asia/Tokyo', name='idx')
+
+        result = idx[...]
+        assert result.equals(idx)
+        assert result is not idx
+
     def test_getitem(self):
         idx1 = pd.date_range('2011-01-01', '2011-01-31', freq='D', name='idx')
         idx2 = pd.date_range('2011-01-01', '2011-01-31', freq='D',

diff --git a/pandas/tests/indexes/period/test_indexing.py b/pandas/tests/indexes/period/test_indexing.py
@@ -13,6 +13,14 @@
 
 
 class TestGetItem(object):
+    def test_ellipsis(self):
+        # GH#21282
+        idx = period_range('2011-01-01', '2011-01-31', freq='D',
+                           name='idx')
+
+        result = idx[...]
+        assert result.equals(idx)
+        assert result is not idx
 
     def test_getitem(self):
         idx1 = pd.period_range('2011-01-01', '2011-01-31', freq='D',

diff --git a/pandas/tests/indexes/timedeltas/test_indexing.py b/pandas/tests/indexes/timedeltas/test_indexing.py
@@ -9,6 +9,14 @@
 
 
 class TestGetItem(object):
+    def test_ellipsis(self):
+        # GH#21282
+        idx = timedelta_range('1 day', '31 day', freq='D', name='idx')
+
+        result = idx[...]
+        assert result.equals(idx)
+        assert result is not idx
+
     def test_getitem(self):
         idx1 = timedelta_range('1 day', '31 day', freq='D', name='idx')
 

diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py
@@ -53,17 +53,11 @@ def test_to_m8():
     valb = datetime(2007, 10, 1)
     valu = _to_m8(valb)
     assert isinstance(valu, np.datetime64)
-    # assert valu == np.datetime64(datetime(2007,10,1))
 
-    # def test_datetime64_box():
-    #    valu = np.datetime64(datetime(2007,10,1))
-    #    valb = _dt_box(valu)
-    #    assert type(valb) == datetime
-    #    assert valb == datetime(2007,10,1)
 
-    #####
-    # DateOffset Tests
-    #####
+#####
+# DateOffset Tests
+#####
 
 
 class Base(object):

diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py
@@ -2457,8 +2457,7 @@ class Nano(Tick):
 # ---------------------------------------------------------------------
 
 
-def generate_range(start=None, end=None, periods=None,
-                   offset=BDay(), time_rule=None):
+def generate_range(start=None, end=None, periods=None, offset=BDay()):
     """
     Generates a sequence of dates corresponding to the specified time
     offset. Similar to dateutil.rrule except uses pandas DateOffset
@@ -2470,26 +2469,20 @@ def generate_range(start=None, end=None, periods=None,
     end : datetime (default None)
     periods : int, (default None)
     offset : DateOffset, (default BDay())
-    time_rule : (legacy) name of DateOffset object to be used, optional
-        Corresponds with names expected by tseries.frequencies.get_offset
 
     Notes
     -----
     * This method is faster for generating weekdays than dateutil.rrule
     * At least two of (start, end, periods) must be specified.
     * If both start and end are specified, the returned dates will
     satisfy start <= date <= end.
-    * If both time_rule and offset are specified, time_rule supersedes offset.
 
     Returns
     -------
     dates : generator object
-
     """
-    if time_rule is not None:
-        from pandas.tseries.frequencies import get_offset
-
-        offset = get_offset(time_rule)
+    from pandas.tseries.frequencies import to_offset
+    offset = to_offset(offset)
 
     start = to_datetime(start)
     end = to_datetime(end)