Skip to content

Commit

Permalink
Implement unique+array parts of 24024 (pandas-dev#24527)
Browse files Browse the repository at this point in the history
  • Loading branch information
jbrockmendel authored and Pingviinituutti committed Feb 28, 2019
1 parent b6b343d commit 367e389
Show file tree
Hide file tree
Showing 7 changed files with 100 additions and 47 deletions.
25 changes: 25 additions & 0 deletions doc/source/whatsnew/v0.24.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -673,6 +673,31 @@ is the case with :attr:`Period.end_time`, for example
p.end_time
.. _whatsnew_0240.api_breaking.datetime_unique:

The return type of :meth:`Series.unique` for datetime with timezone values has changed
from a :class:`numpy.ndarray` of :class:`Timestamp` objects to an :class:`arrays.DatetimeArray` (:issue:`24024`).

.. ipython:: python
ser = pd.Series([pd.Timestamp('2000', tz='UTC'),
pd.Timestamp('2000', tz='UTC')])
*Previous Behavior*:

.. code-block:: ipython
In [3]: ser.unique()
Out[3]: array([Timestamp('2000-01-01 00:00:00+0000', tz='UTC')], dtype=object)
*New Behavior*:

.. ipython:: python
ser.unique()
.. _whatsnew_0240.api_breaking.sparse_values:

Sparse Data Structure Refactor
Expand Down
6 changes: 5 additions & 1 deletion pandas/arrays/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,19 @@
"""
from pandas.core.arrays import (
IntervalArray, PeriodArray, Categorical, SparseArray, IntegerArray,
PandasArray
PandasArray,
DatetimeArrayMixin as DatetimeArray,
TimedeltaArrayMixin as TimedeltaArray,
)


__all__ = [
'Categorical',
'DatetimeArray',
'IntegerArray',
'IntervalArray',
'PandasArray',
'PeriodArray',
'SparseArray',
'TimedeltaArray',
]
11 changes: 3 additions & 8 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -350,21 +350,16 @@ def unique(values):
if is_extension_array_dtype(values):
# Dispatch to extension dtype's unique.
return values.unique()
elif is_datetime64tz_dtype(values):
# TODO: merge this check into the previous one following #24024
return values.unique()

original = values
htable, _, values, dtype, ndtype = _get_hashtable_algo(values)

table = htable(len(values))
uniques = table.unique(values)
uniques = _reconstruct_data(uniques, dtype, original)

if isinstance(original, ABCSeries) and is_datetime64tz_dtype(dtype):
# we are special casing datetime64tz_dtype
# to return an object array of tz-aware Timestamps

# TODO: it must return DatetimeArray with tz in pandas 2.0
uniques = uniques.astype(object).values

return uniques


Expand Down
20 changes: 14 additions & 6 deletions pandas/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,9 @@
from pandas.util._validators import validate_bool_kwarg

from pandas.core.dtypes.common import (
is_datetime64tz_dtype, is_datetimelike, is_extension_array_dtype,
is_extension_type, is_list_like, is_object_dtype, is_scalar)
is_datetime64_ns_dtype, is_datetime64tz_dtype, is_datetimelike,
is_extension_array_dtype, is_extension_type, is_list_like, is_object_dtype,
is_scalar, is_timedelta64_ns_dtype)
from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries
from pandas.core.dtypes.missing import isna

Expand Down Expand Up @@ -849,12 +850,19 @@ def array(self):
"""
result = self._values

# TODO(DatetimeArray): remove the second clause.
if (not is_extension_array_dtype(result.dtype)
and not is_datetime64tz_dtype(result.dtype)):
from pandas.core.arrays.numpy_ import PandasArray
if (is_datetime64_ns_dtype(result.dtype) or
is_datetime64tz_dtype(result.dtype)):
from pandas.arrays import DatetimeArray
result = DatetimeArray(result)

elif is_timedelta64_ns_dtype(result.dtype):
from pandas.arrays import TimedeltaArray
result = TimedeltaArray(result)

elif not is_extension_array_dtype(result.dtype):
from pandas.core.arrays.numpy_ import PandasArray
result = PandasArray(result)

return result

def to_numpy(self, dtype=None, copy=False):
Expand Down
39 changes: 22 additions & 17 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,9 @@

from pandas.core.dtypes.common import (
_is_unorderable_exception, ensure_platform_int, is_bool,
is_categorical_dtype, is_datetime64_dtype, is_datetime64tz_dtype,
is_datetimelike, is_dict_like, is_extension_array_dtype, is_extension_type,
is_hashable, is_integer, is_iterator, is_list_like, is_scalar,
is_string_like, is_timedelta64_dtype)
is_categorical_dtype, is_datetime64_dtype, is_datetimelike, is_dict_like,
is_extension_array_dtype, is_extension_type, is_hashable, is_integer,
is_iterator, is_list_like, is_scalar, is_string_like, is_timedelta64_dtype)
from pandas.core.dtypes.generic import (
ABCDataFrame, ABCDatetimeIndex, ABCSeries, ABCSparseArray, ABCSparseSeries)
from pandas.core.dtypes.missing import (
Expand Down Expand Up @@ -1556,9 +1555,18 @@ def unique(self):
Returns
-------
ndarray or Categorical
The unique values returned as a NumPy array. In case of categorical
data type, returned as a Categorical.
ndarray or ExtensionArray
The unique values returned as a NumPy array. In case of an
extension-array backed Series, a new
:class:`~api.extensions.ExtensionArray` of that type with just
the unique values is returned. This includes
* Categorical
* Period
* Datetime with Timezone
* Interval
* Sparse
* IntegerNA
See Also
--------
Expand All @@ -1575,8 +1583,9 @@ def unique(self):
>>> pd.Series([pd.Timestamp('2016-01-01', tz='US/Eastern')
... for _ in range(3)]).unique()
array([Timestamp('2016-01-01 00:00:00-0500', tz='US/Eastern')],
dtype=object)
<DatetimeArrayMixin>
['2016-01-01 00:00:00-05:00']
Length: 1, dtype: datetime64[ns, US/Eastern]
An unordered Categorical will return categories in the order of
appearance.
Expand All @@ -1593,14 +1602,10 @@ def unique(self):
Categories (3, object): [a < b < c]
"""
result = super(Series, self).unique()

if is_datetime64tz_dtype(self.dtype):
# we are special casing datetime64tz_dtype
# to return an object array of tz-aware Timestamps

# TODO: it must return DatetimeArray with tz in pandas 2.0
result = result.astype(object).values

if isinstance(result, DatetimeIndex):
# TODO: This should be unnecessary after Series._values returns
# DatetimeArray
result = result._eadata
return result

def drop_duplicates(self, keep='first', inplace=False):
Expand Down
15 changes: 9 additions & 6 deletions pandas/tests/test_algos.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from pandas._libs import (groupby as libgroupby, algos as libalgos,
hashtable as ht)
from pandas.compat import lrange, range
from pandas.core.arrays import DatetimeArrayMixin as DatetimeArray
import pandas.core.algorithms as algos
import pandas.core.common as com
import pandas.util.testing as tm
Expand Down Expand Up @@ -456,9 +457,10 @@ def test_datetime64tz_aware(self):
result = Series(
Index([Timestamp('20160101', tz='US/Eastern'),
Timestamp('20160101', tz='US/Eastern')])).unique()
expected = np.array([Timestamp('2016-01-01 00:00:00-0500',
tz='US/Eastern')], dtype=object)
tm.assert_numpy_array_equal(result, expected)
expected = DatetimeArray._from_sequence(np.array([
Timestamp('2016-01-01 00:00:00-0500', tz="US/Eastern")
]))
tm.assert_extension_array_equal(result, expected)

result = Index([Timestamp('20160101', tz='US/Eastern'),
Timestamp('20160101', tz='US/Eastern')]).unique()
Expand All @@ -469,9 +471,10 @@ def test_datetime64tz_aware(self):
result = pd.unique(
Series(Index([Timestamp('20160101', tz='US/Eastern'),
Timestamp('20160101', tz='US/Eastern')])))
expected = np.array([Timestamp('2016-01-01 00:00:00-0500',
tz='US/Eastern')], dtype=object)
tm.assert_numpy_array_equal(result, expected)
expected = DatetimeArray._from_sequence(np.array([
Timestamp('2016-01-01', tz="US/Eastern"),
]))
tm.assert_extension_array_equal(result, expected)

result = pd.unique(Index([Timestamp('20160101', tz='US/Eastern'),
Timestamp('20160101', tz='US/Eastern')]))
Expand Down
31 changes: 22 additions & 9 deletions pandas/tests/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,15 @@
import pandas.compat as compat
from pandas.core.dtypes.common import (
is_object_dtype, is_datetime64_dtype, is_datetime64tz_dtype,
needs_i8_conversion)
needs_i8_conversion, is_timedelta64_dtype)
import pandas.util.testing as tm
from pandas import (Series, Index, DatetimeIndex, TimedeltaIndex,
PeriodIndex, Timedelta, IntervalIndex, Interval,
CategoricalIndex, Timestamp, DataFrame, Panel)
from pandas.core.arrays import (
DatetimeArrayMixin as DatetimeArray,
TimedeltaArrayMixin as TimedeltaArray,
)
from pandas.compat import StringIO, PYPY, long
from pandas.compat.numpy import np_array_datetime64_compat
from pandas.core.arrays import PandasArray
Expand Down Expand Up @@ -383,8 +387,12 @@ def test_value_counts_unique_nunique(self):
assert result[0] == orig[0]
for r in result:
assert isinstance(r, Timestamp)
tm.assert_numpy_array_equal(result,
orig._values.astype(object).values)

# TODO(#24024) once orig._values returns DTA, remove
# the `._eadata` below
tm.assert_numpy_array_equal(
result.astype(object),
orig._values._eadata.astype(object))
else:
tm.assert_numpy_array_equal(result, orig.values)

Expand All @@ -410,7 +418,9 @@ def test_value_counts_unique_nunique_null(self):
else:
o = o.copy()
o[0:2] = iNaT
values = o._values
# TODO(#24024) once Series._values returns DTA, remove
# the `._eadata` here
values = o._values._eadata

elif needs_i8_conversion(o):
values[0:2] = iNaT
Expand All @@ -431,7 +441,7 @@ def test_value_counts_unique_nunique_null(self):
o = klass(values.repeat(range(1, len(o) + 1)))
o.name = 'a'
else:
if is_datetime64tz_dtype(o):
if isinstance(o, DatetimeIndex):
expected_index = orig._values._shallow_copy(values)
else:
expected_index = Index(values)
Expand Down Expand Up @@ -472,8 +482,7 @@ def test_value_counts_unique_nunique_null(self):
Index(values[1:], name='a'))
elif is_datetime64tz_dtype(o):
# unable to compare NaT / nan
vals = values[2:].astype(object).values
tm.assert_numpy_array_equal(result[1:], vals)
tm.assert_extension_array_equal(result[1:], values[2:])
assert result[0] is pd.NaT
else:
tm.assert_numpy_array_equal(result[1:], values[2:])
Expand Down Expand Up @@ -1187,7 +1196,6 @@ def test_ndarray_values(array, expected):

@pytest.mark.parametrize("arr", [
np.array([1, 2, 3]),
np.array([1, 2, 3], dtype="datetime64[ns]"),
])
def test_numpy_array(arr):
ser = pd.Series(arr)
Expand All @@ -1199,7 +1207,12 @@ def test_numpy_array(arr):
def test_numpy_array_all_dtypes(any_numpy_dtype):
ser = pd.Series(dtype=any_numpy_dtype)
result = ser.array
assert isinstance(result, PandasArray)
if is_datetime64_dtype(any_numpy_dtype):
assert isinstance(result, DatetimeArray)
elif is_timedelta64_dtype(any_numpy_dtype):
assert isinstance(result, TimedeltaArray)
else:
assert isinstance(result, PandasArray)


@pytest.mark.parametrize("array, attr", [
Expand Down

0 comments on commit 367e389

Please sign in to comment.