Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: Groupby.agg with reduction function with tz aware data #25308

Merged
merged 22 commits into from
Feb 28, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
46943e0
BUG: Groupby.agg cannot reduce with tz aware data
Feb 12, 2019
724a69e
Handle output always as UTC
Feb 13, 2019
54a5b7c
Merge remote-tracking branch 'upstream/master' into timezone_agg
Feb 13, 2019
0d1eb55
Add whatsnew
Feb 13, 2019
b4913dc
isort and add another fixed groupby.first/last issue
Feb 13, 2019
2aa3a2c
Merge remote-tracking branch 'upstream/master' into timezone_agg
Feb 14, 2019
18198f4
Merge remote-tracking branch 'upstream/master' into timezone_agg
Feb 14, 2019
c970848
Merge remote-tracking branch 'upstream/master' into timezone_agg
Feb 16, 2019
da26b7b
bring condition at a higher level
Feb 16, 2019
1295d4f
Merge remote-tracking branch 'upstream/master' into timezone_agg
Feb 17, 2019
60adcf0
Add try for _try_cast
Feb 18, 2019
e9ac6e8
Merge remote-tracking branch 'upstream/master' into timezone_agg
Feb 19, 2019
619f071
Merge remote-tracking branch 'upstream/master' into timezone_agg
Feb 19, 2019
206303a
Add comments
Feb 19, 2019
4ac569c
Merge remote-tracking branch 'upstream/master' into timezone_agg
Feb 21, 2019
1339a45
Merge remote-tracking branch 'upstream/master' into timezone_agg
Feb 23, 2019
0c0b43a
Don't pass the utc_dtype explicitly
Feb 24, 2019
5bf07a9
Remove unused import
Feb 24, 2019
95a48d6
Use string dtype instead
Feb 26, 2019
7feb4a7
Merge remote-tracking branch 'upstream/master' into timezone_agg
Feb 27, 2019
fd1a2ec
Merge remote-tracking branch 'upstream/master' into timezone_agg
Feb 28, 2019
3cfe961
Merge remote-tracking branch 'upstream/master' into timezone_agg
Feb 28, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions doc/source/whatsnew/v0.25.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -229,8 +229,8 @@ Groupby/Resample/Rolling

- Bug in :meth:`pandas.core.resample.Resampler.agg` with a timezone aware index where ``OverflowError`` would raise when passing a list of functions (:issue:`22660`)
- Bug in :meth:`pandas.core.groupby.DataFrameGroupBy.nunique` in which the names of column levels were lost (:issue:`23222`)
-
-
- Bug in :func:`pandas.core.groupby.GroupBy.agg` when applying a aggregation function to timezone aware data (:issue:`23683`)
- Bug in :func:`pandas.core.groupby.GroupBy.first` and :func:`pandas.core.groupby.GroupBy.last` where timezone information would be dropped (:issue:`21603`)


Reshaping
Expand Down
4 changes: 3 additions & 1 deletion pandas/_libs/reduction.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -342,7 +342,9 @@ cdef class SeriesGrouper:
index = None
else:
values = dummy.values
if dummy.dtype != self.arr.dtype:
# GH 23683: datetimetz types are equivalent to datetime types here
if (dummy.dtype != self.arr.dtype
and values.dtype != self.arr.dtype):
raise ValueError('Dummy array must be same dtype')
if not values.flags.contiguous:
values = values.copy()
Expand Down
19 changes: 17 additions & 2 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ class providing the base-class of operations.

from pandas.core.dtypes.cast import maybe_downcast_to_dtype
from pandas.core.dtypes.common import (
ensure_float, is_extension_array_dtype, is_numeric_dtype, is_scalar)
ensure_float, is_datetime64tz_dtype, is_extension_array_dtype,
is_numeric_dtype, is_scalar)
from pandas.core.dtypes.missing import isna, notna

from pandas.api.types import (
Expand Down Expand Up @@ -766,7 +767,21 @@ def _try_cast(self, result, obj, numeric_only=False):
dtype = obj.dtype

if not is_scalar(result):
if is_extension_array_dtype(dtype):
if is_datetime64tz_dtype(dtype):
# GH 23683
# Prior results _may_ have been generated in UTC.
# Ensure we localize to UTC first before converting
# to the target timezone
try:
result = obj._values._from_sequence(
result, dtype='datetime64[ns, UTC]'
)
result = result.astype(dtype)
except TypeError:
jreback marked this conversation as resolved.
Show resolved Hide resolved
jreback marked this conversation as resolved.
Show resolved Hide resolved
# _try_cast was called at a point where the result
# was already tz-aware
pass
elif is_extension_array_dtype(dtype):
# The function can return something of any type, so check
# if the type is compatible with the calling EA.
try:
Expand Down
15 changes: 15 additions & 0 deletions pandas/tests/groupby/aggregate/test_other.py
Original file line number Diff line number Diff line change
Expand Up @@ -512,3 +512,18 @@ def test_agg_list_like_func():
expected = pd.DataFrame({'A': [str(x) for x in range(3)],
'B': [[str(x)] for x in range(3)]})
tm.assert_frame_equal(result, expected)


def test_agg_lambda_with_timezone():
# GH 23683
df = pd.DataFrame({
'tag': [1, 1],
'date': [
pd.Timestamp('2018-01-01', tz='UTC'),
pd.Timestamp('2018-01-02', tz='UTC')]
})
result = df.groupby('tag').agg({'date': lambda e: e.head(1)})
expected = pd.DataFrame([pd.Timestamp('2018-01-01', tz='UTC')],
index=pd.Index([1], name='tag'),
columns=['date'])
tm.assert_frame_equal(result, expected)
20 changes: 20 additions & 0 deletions pandas/tests/groupby/test_nth.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,26 @@ def test_first_last_tz(data, expected_first, expected_last):
assert_frame_equal(result, expected[['id', 'time']])


@pytest.mark.parametrize('method, ts, alpha', [
['first', Timestamp('2013-01-01', tz='US/Eastern'), 'a'],
['last', Timestamp('2013-01-02', tz='US/Eastern'), 'b']
])
def test_first_last_tz_multi_column(method, ts, alpha):
# GH 21603
df = pd.DataFrame({'group': [1, 1, 2],
'category_string': pd.Series(list('abc')).astype(
'category'),
'datetimetz': pd.date_range('20130101', periods=3,
tz='US/Eastern')})
result = getattr(df.groupby('group'), method)()
expepcted = pd.DataFrame({'category_string': [alpha, 'c'],
'datetimetz': [ts,
Timestamp('2013-01-03',
tz='US/Eastern')]},
index=pd.Index([1, 2], name='group'))
assert_frame_equal(result, expepcted)


def test_nth_multi_index_as_expected():
# PR 9090, related to issue 8979
# test nth on MultiIndex
Expand Down