From 367e389b8d7226469a2577522bea9dfef217103d Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Tue, 1 Jan 2019 08:16:22 -0800
Subject: [PATCH] Implement unique+array parts of 24024 (#24527)

---
 doc/source/whatsnew/v0.24.0.rst | 25 +++++++++++++++++++++
 pandas/arrays/__init__.py       |  6 ++++-
 pandas/core/algorithms.py       | 11 +++-------
 pandas/core/base.py             | 20 ++++++++++++-----
 pandas/core/series.py           | 39 +++++++++++++++++++--------------
 pandas/tests/test_algos.py      | 15 ++++++++-----
 pandas/tests/test_base.py       | 31 ++++++++++++++++++--------
 7 files changed, 100 insertions(+), 47 deletions(-)

diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst
index dc10f25fbd9d6c..8c686db22299b2 100644
--- a/doc/source/whatsnew/v0.24.0.rst
+++ b/doc/source/whatsnew/v0.24.0.rst
@@ -673,6 +673,31 @@ is the case with :attr:`Period.end_time`, for example
 
     p.end_time
 
+.. _whatsnew_0240.api_breaking.datetime_unique:
+
+The return type of :meth:`Series.unique` for datetime with timezone values has changed
+from an :class:`numpy.ndarray` of :class:`Timestamp` objects to a :class:`arrays.DatetimeArray` (:issue:`24024`).
+
+.. ipython:: python
+
+    ser = pd.Series([pd.Timestamp('2000', tz='UTC'),
+                     pd.Timestamp('2000', tz='UTC')])
+
+*Previous Behavior*:
+
+.. code-block:: ipython
+
+    In [3]: ser.unique()
+    Out[3]: array([Timestamp('2000-01-01 00:00:00+0000', tz='UTC')], dtype=object)
+
+
+*New Behavior*:
+
+.. ipython:: python
+
+    ser.unique()
+
+
 .. _whatsnew_0240.api_breaking.sparse_values:
 
 Sparse Data Structure Refactor
diff --git a/pandas/arrays/__init__.py b/pandas/arrays/__init__.py
index 1a7d5821be0cb1..5433d11eccff92 100644
--- a/pandas/arrays/__init__.py
+++ b/pandas/arrays/__init__.py
@@ -5,15 +5,19 @@
 """
 from pandas.core.arrays import (
     IntervalArray, PeriodArray, Categorical, SparseArray, IntegerArray,
-    PandasArray
+    PandasArray,
+    DatetimeArrayMixin as DatetimeArray,
+    TimedeltaArrayMixin as TimedeltaArray,
 )
 
 
 __all__ = [
     'Categorical',
+    'DatetimeArray',
     'IntegerArray',
     'IntervalArray',
     'PandasArray',
     'PeriodArray',
     'SparseArray',
+    'TimedeltaArray',
 ]
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index 3c4fe519e41818..8d85b84ec75071 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -350,6 +350,9 @@ def unique(values):
     if is_extension_array_dtype(values):
         # Dispatch to extension dtype's unique.
         return values.unique()
+    elif is_datetime64tz_dtype(values):
+        # TODO: merge this check into the previous one following #24024
+        return values.unique()
 
     original = values
     htable, _, values, dtype, ndtype = _get_hashtable_algo(values)
@@ -357,14 +360,6 @@ def unique(values):
     table = htable(len(values))
     uniques = table.unique(values)
     uniques = _reconstruct_data(uniques, dtype, original)
-
-    if isinstance(original, ABCSeries) and is_datetime64tz_dtype(dtype):
-        # we are special casing datetime64tz_dtype
-        # to return an object array of tz-aware Timestamps
-
-        # TODO: it must return DatetimeArray with tz in pandas 2.0
-        uniques = uniques.astype(object).values
-
     return uniques
 
 
diff --git a/pandas/core/base.py b/pandas/core/base.py
index 8af4b59c4634bd..cc1bda620c215c 100644
--- a/pandas/core/base.py
+++ b/pandas/core/base.py
@@ -15,8 +15,9 @@
 from pandas.util._validators import validate_bool_kwarg
 
 from pandas.core.dtypes.common import (
-    is_datetime64tz_dtype, is_datetimelike, is_extension_array_dtype,
-    is_extension_type, is_list_like, is_object_dtype, is_scalar)
+    is_datetime64_ns_dtype, is_datetime64tz_dtype, is_datetimelike,
+    is_extension_array_dtype, is_extension_type, is_list_like, is_object_dtype,
+    is_scalar, is_timedelta64_ns_dtype)
 from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries
 from pandas.core.dtypes.missing import isna
 
@@ -849,12 +850,19 @@ def array(self):
         """
         result = self._values
 
-        # TODO(DatetimeArray): remvoe the second clause.
-        if (not is_extension_array_dtype(result.dtype)
-                and not is_datetime64tz_dtype(result.dtype)):
-            from pandas.core.arrays.numpy_ import PandasArray
+        if (is_datetime64_ns_dtype(result.dtype) or
+                is_datetime64tz_dtype(result.dtype)):
+            from pandas.arrays import DatetimeArray
+            result = DatetimeArray(result)
+
+        elif is_timedelta64_ns_dtype(result.dtype):
+            from pandas.arrays import TimedeltaArray
+            result = TimedeltaArray(result)
 
+        elif not is_extension_array_dtype(result.dtype):
+            from pandas.core.arrays.numpy_ import PandasArray
             result = PandasArray(result)
+
         return result
 
     def to_numpy(self, dtype=None, copy=False):
diff --git a/pandas/core/series.py b/pandas/core/series.py
index c4f61ccf830d4f..672fa2edb00bae 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -17,10 +17,9 @@
 from pandas.core.dtypes.common import (
     _is_unorderable_exception, ensure_platform_int, is_bool,
-    is_categorical_dtype, is_datetime64_dtype, is_datetime64tz_dtype,
-    is_datetimelike, is_dict_like, is_extension_array_dtype, is_extension_type,
-    is_hashable, is_integer, is_iterator, is_list_like, is_scalar,
-    is_string_like, is_timedelta64_dtype)
+    is_categorical_dtype, is_datetime64_dtype, is_datetimelike, is_dict_like,
+    is_extension_array_dtype, is_extension_type, is_hashable, is_integer,
+    is_iterator, is_list_like, is_scalar, is_string_like, is_timedelta64_dtype)
 from pandas.core.dtypes.generic import (
     ABCDataFrame, ABCDatetimeIndex, ABCSeries, ABCSparseArray, ABCSparseSeries)
 from pandas.core.dtypes.missing import (
@@ -1556,9 +1555,18 @@ def unique(self):
 
         Returns
         -------
-        ndarray or Categorical
-            The unique values returned as a NumPy array. In case of categorical
-            data type, returned as a Categorical.
+        ndarray or ExtensionArray
+            The unique values returned as a NumPy array. In case of an
+            extension-array backed Series, a new
+            :class:`~api.extensions.ExtensionArray` of that type with just
+            the unique values is returned. This includes
+
+            * Categorical
+            * Period
+            * Datetime with Timezone
+            * Interval
+            * Sparse
+            * IntegerNA
 
         See Also
         --------
@@ -1575,8 +1583,9 @@ def unique(self):
 
         >>> pd.Series([pd.Timestamp('2016-01-01', tz='US/Eastern')
         ...            for _ in range(3)]).unique()
-        array([Timestamp('2016-01-01 00:00:00-0500', tz='US/Eastern')],
-              dtype=object)
+        <DatetimeArray>
+        ['2016-01-01 00:00:00-05:00']
+        Length: 1, dtype: datetime64[ns, US/Eastern]
 
         An unordered Categorical will return categories in the order
         of appearance.
@@ -1593,14 +1602,10 @@ def unique(self):
         Categories (3, object): [a < b < c]
         """
         result = super(Series, self).unique()
-
-        if is_datetime64tz_dtype(self.dtype):
-            # we are special casing datetime64tz_dtype
-            # to return an object array of tz-aware Timestamps
-
-            # TODO: it must return DatetimeArray with tz in pandas 2.0
-            result = result.astype(object).values
-
+        if isinstance(result, DatetimeIndex):
+            # TODO: This should be unnecessary after Series._values returns
+            # DatetimeArray
+            result = result._eadata
         return result
 
     def drop_duplicates(self, keep='first', inplace=False):
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
index c9d403f6696af1..8d7fd6449b354c 100644
--- a/pandas/tests/test_algos.py
+++ b/pandas/tests/test_algos.py
@@ -16,6 +16,7 @@
 from pandas._libs import (groupby as libgroupby, algos as libalgos,
                           hashtable as ht)
 from pandas.compat import lrange, range
+from pandas.core.arrays import DatetimeArrayMixin as DatetimeArray
 import pandas.core.algorithms as algos
 import pandas.core.common as com
 import pandas.util.testing as tm
@@ -456,9 +457,10 @@ def test_datetime64tz_aware(self):
         result = Series(
             Index([Timestamp('20160101', tz='US/Eastern'),
                    Timestamp('20160101', tz='US/Eastern')])).unique()
-        expected = np.array([Timestamp('2016-01-01 00:00:00-0500',
-                                       tz='US/Eastern')], dtype=object)
-        tm.assert_numpy_array_equal(result, expected)
+        expected = DatetimeArray._from_sequence(np.array([
+            Timestamp('2016-01-01 00:00:00-0500', tz="US/Eastern")
+        ]))
+        tm.assert_extension_array_equal(result, expected)
 
         result = Index([Timestamp('20160101', tz='US/Eastern'),
                         Timestamp('20160101', tz='US/Eastern')]).unique()
@@ -469,9 +471,10 @@ def test_datetime64tz_aware(self):
         result = pd.unique(
             Series(Index([Timestamp('20160101', tz='US/Eastern'),
                           Timestamp('20160101', tz='US/Eastern')])))
-        expected = np.array([Timestamp('2016-01-01 00:00:00-0500',
-                                       tz='US/Eastern')], dtype=object)
-        tm.assert_numpy_array_equal(result, expected)
+        expected = DatetimeArray._from_sequence(np.array([
+            Timestamp('2016-01-01', tz="US/Eastern"),
+        ]))
+        tm.assert_extension_array_equal(result, expected)
 
         result = pd.unique(Index([Timestamp('20160101', tz='US/Eastern'),
                                   Timestamp('20160101', tz='US/Eastern')]))
diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py
index 44fd64e9fc78c0..50db4f67cc3cff 100644
--- a/pandas/tests/test_base.py
+++ b/pandas/tests/test_base.py
@@ -11,11 +11,15 @@
 import pandas.compat as compat
 from pandas.core.dtypes.common import (
     is_object_dtype, is_datetime64_dtype, is_datetime64tz_dtype,
-    needs_i8_conversion)
+    needs_i8_conversion, is_timedelta64_dtype)
 import pandas.util.testing as tm
 from pandas import (Series, Index, DatetimeIndex, TimedeltaIndex,
                     PeriodIndex, Timedelta, IntervalIndex, Interval,
                     CategoricalIndex, Timestamp, DataFrame, Panel)
+from pandas.core.arrays import (
+    DatetimeArrayMixin as DatetimeArray,
+    TimedeltaArrayMixin as TimedeltaArray,
+)
 from pandas.compat import StringIO, PYPY, long
 from pandas.compat.numpy import np_array_datetime64_compat
 from pandas.core.arrays import PandasArray
@@ -383,8 +387,12 @@ def test_value_counts_unique_nunique(self):
                 assert result[0] == orig[0]
                 for r in result:
                     assert isinstance(r, Timestamp)
-                tm.assert_numpy_array_equal(result,
-                                            orig._values.astype(object).values)
+
+                # TODO(#24024) once orig._values returns DTA, remove
+                # the `._eadata` below
+                tm.assert_numpy_array_equal(
+                    result.astype(object),
+                    orig._values._eadata.astype(object))
             else:
                 tm.assert_numpy_array_equal(result, orig.values)
 
@@ -410,7 +418,9 @@ def test_value_counts_unique_nunique_null(self):
                 else:
                     o = o.copy()
                     o[0:2] = iNaT
-                    values = o._values
+                    # TODO(#24024) once Series._values returns DTA, remove
+                    # the `._eadata` here
+                    values = o._values._eadata
 
             elif needs_i8_conversion(o):
                 values[0:2] = iNaT
@@ -431,7 +441,7 @@ def test_value_counts_unique_nunique_null(self):
                 o = klass(values.repeat(range(1, len(o) + 1)))
                 o.name = 'a'
             else:
-                if is_datetime64tz_dtype(o):
+                if isinstance(o, DatetimeIndex):
                     expected_index = orig._values._shallow_copy(values)
                 else:
                     expected_index = Index(values)
@@ -472,8 +482,7 @@ def test_value_counts_unique_nunique_null(self):
                            Index(values[1:], name='a'))
             elif is_datetime64tz_dtype(o):
                 # unable to compare NaT / nan
-                vals = values[2:].astype(object).values
-                tm.assert_numpy_array_equal(result[1:], vals)
+                tm.assert_extension_array_equal(result[1:], values[2:])
                 assert result[0] is pd.NaT
             else:
                 tm.assert_numpy_array_equal(result[1:], values[2:])
@@ -1187,7 +1196,6 @@ def test_ndarray_values(array, expected):
 
 @pytest.mark.parametrize("arr", [
     np.array([1, 2, 3]),
-    np.array([1, 2, 3], dtype="datetime64[ns]"),
 ])
 def test_numpy_array(arr):
     ser = pd.Series(arr)
@@ -1199,7 +1207,12 @@ def test_numpy_array_all_dtypes(any_numpy_dtype):
     ser = pd.Series(dtype=any_numpy_dtype)
     result = ser.array
-    assert isinstance(result, PandasArray)
+    if is_datetime64_dtype(any_numpy_dtype):
+        assert isinstance(result, DatetimeArray)
+    elif is_timedelta64_dtype(any_numpy_dtype):
+        assert isinstance(result, TimedeltaArray)
+    else:
+        assert isinstance(result, PandasArray)
 
 
 @pytest.mark.parametrize("array, attr", [
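
Not part of the patch itself: a minimal usage sketch of the behaviour these
changes introduce, assuming pandas 0.24.x (the release this patch targets),
where ``pandas.arrays`` exposes the ``DatetimeArray`` and ``TimedeltaArray``
aliases added in pandas/arrays/__init__.py above.

    import pandas as pd

    # Series.unique on tz-aware datetime data now returns a DatetimeArray
    # instead of an object-dtype ndarray of Timestamp objects.
    ser = pd.Series([pd.Timestamp('2000', tz='UTC'),
                     pd.Timestamp('2000', tz='UTC')])
    uniques = ser.unique()
    assert isinstance(uniques, pd.arrays.DatetimeArray)
    assert len(uniques) == 1

    # Series.array now dispatches on dtype: datetime64[ns] and tz-aware
    # values give a DatetimeArray, timedelta64[ns] gives a TimedeltaArray,
    # and any other NumPy dtype is wrapped in a PandasArray.
    assert isinstance(ser.array, pd.arrays.DatetimeArray)
    assert isinstance(pd.Series(pd.to_timedelta(['1 day'])).array,
                      pd.arrays.TimedeltaArray)
    assert isinstance(pd.Series([1, 2, 3]).array, pd.arrays.PandasArray)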