Skip to content

Commit

Permalink
BUG: Merge timezone aware data with DST (pandas-dev#22825)
Browse files Browse the repository at this point in the history
  • Loading branch information
mroeschke authored and jreback committed Oct 1, 2018
1 parent f021bbc commit a277e4a
Show file tree
Hide file tree
Showing 4 changed files with 93 additions and 31 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.24.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -815,6 +815,7 @@ Reshaping
- Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` when dict is used as the ``to_replace`` value and one key in the dict is another key's value, the results were inconsistent between using integer key and using string key (:issue:`20656`)
- Bug in :meth:`DataFrame.drop_duplicates` for empty ``DataFrame`` which incorrectly raises an error (:issue:`20516`)
- Bug in :func:`pandas.wide_to_long` when a string is passed to the stubnames argument and a column name is a substring of that stubname (:issue:`22468`)
- Bug in :func:`merge` when merging ``datetime64[ns, tz]`` data that contained a DST transition (:issue:`18885`)

Build Changes
^^^^^^^^^^^^^
Expand Down
59 changes: 39 additions & 20 deletions pandas/core/indexes/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,7 +277,7 @@ def _evaluate_compare(self, other, op):
except TypeError:
return result

def _ensure_localized(self, result, ambiguous='raise'):
def _ensure_localized(self, arg, ambiguous='raise', from_utc=False):
"""
ensure that we are re-localized
Expand All @@ -286,9 +286,11 @@ def _ensure_localized(self, result, ambiguous='raise'):
Parameters
----------
result : DatetimeIndex / i8 ndarray
ambiguous : str, bool, or bool-ndarray
default 'raise'
arg : DatetimeIndex / i8 ndarray
ambiguous : str, bool, or bool-ndarray, default 'raise'
from_utc : bool, default False
If True, localize the i8 ndarray to UTC first before converting to
the appropriate tz. If False, localize directly to the tz.
Returns
-------
Expand All @@ -297,10 +299,13 @@ def _ensure_localized(self, result, ambiguous='raise'):

# reconvert to local tz
if getattr(self, 'tz', None) is not None:
if not isinstance(result, ABCIndexClass):
result = self._simple_new(result)
result = result.tz_localize(self.tz, ambiguous=ambiguous)
return result
if not isinstance(arg, ABCIndexClass):
arg = self._simple_new(arg)
if from_utc:
arg = arg.tz_localize('UTC').tz_convert(self.tz)
else:
arg = arg.tz_localize(self.tz, ambiguous=ambiguous)
return arg

def _box_values_as_index(self):
"""
Expand Down Expand Up @@ -622,11 +627,11 @@ def repeat(self, repeats, *args, **kwargs):

@Appender(_index_shared_docs['where'] % _index_doc_kwargs)
def where(self, cond, other=None):
other = _ensure_datetimelike_to_i8(other)
values = _ensure_datetimelike_to_i8(self)
other = _ensure_datetimelike_to_i8(other, to_utc=True)
values = _ensure_datetimelike_to_i8(self, to_utc=True)
result = np.where(cond, values, other).astype('i8')

result = self._ensure_localized(result)
result = self._ensure_localized(result, from_utc=True)
return self._shallow_copy(result,
**self._get_attributes_dict())

Expand Down Expand Up @@ -695,23 +700,37 @@ def astype(self, dtype, copy=True):
return super(DatetimeIndexOpsMixin, self).astype(dtype, copy=copy)


def _ensure_datetimelike_to_i8(other):
""" helper for coercing an input scalar or array to i8 """
def _ensure_datetimelike_to_i8(other, to_utc=False):
"""
helper for coercing an input scalar or array to i8
Parameters
----------
other : 1d array
to_utc : bool, default False
If True, convert the values to UTC before extracting the i8 values.
If False, extract the i8 values directly.
Returns
-------
i8 1d array
"""
if is_scalar(other) and isna(other):
other = iNaT
return iNaT
elif isinstance(other, ABCIndexClass):
# convert tz if needed
if getattr(other, 'tz', None) is not None:
other = other.tz_localize(None).asi8
else:
other = other.asi8
if to_utc:
other = other.tz_convert('UTC')
else:
other = other.tz_localize(None)
else:
try:
other = np.array(other, copy=False).view('i8')
return np.array(other, copy=False).view('i8')
except TypeError:
# period array cannot be coerced to int
other = Index(other).asi8
return other
other = Index(other)
return other.asi8


def wrap_arithmetic_op(self, other, result):
Expand Down
40 changes: 29 additions & 11 deletions pandas/tests/indexing/test_coercion.py
Original file line number Diff line number Diff line change
Expand Up @@ -590,11 +590,9 @@ def test_where_series_datetime64(self, fill_val, exp_dtype):
pd.Timestamp('2011-01-03'), values[3]])
self._assert_where_conversion(obj, cond, values, exp, exp_dtype)

@pytest.mark.parametrize("fill_val,exp_dtype", [
(pd.Timestamp('2012-01-01'), 'datetime64[ns]'),
(pd.Timestamp('2012-01-01', tz='US/Eastern'), np.object)],
ids=['datetime64', 'datetime64tz'])
def test_where_index_datetime(self, fill_val, exp_dtype):
def test_where_index_datetime(self):
fill_val = pd.Timestamp('2012-01-01')
exp_dtype = 'datetime64[ns]'
obj = pd.Index([pd.Timestamp('2011-01-01'),
pd.Timestamp('2011-01-02'),
pd.Timestamp('2011-01-03'),
Expand All @@ -613,13 +611,33 @@ def test_where_index_datetime(self, fill_val, exp_dtype):
pd.Timestamp('2011-01-03'),
pd.Timestamp('2012-01-04')])

if fill_val.tz:
self._assert_where_conversion(obj, cond, values, exp,
'datetime64[ns]')
pytest.xfail("ToDo: do not ignore timezone, must be object")
self._assert_where_conversion(obj, cond, values, exp, exp_dtype)
pytest.xfail("datetime64 + datetime64 -> datetime64 must support"
" scalar")

@pytest.mark.xfail(
reason="GH 22839: do not ignore timezone, must be object")
def test_where_index_datetimetz(self):
fill_val = pd.Timestamp('2012-01-01', tz='US/Eastern')
exp_dtype = np.object
obj = pd.Index([pd.Timestamp('2011-01-01'),
pd.Timestamp('2011-01-02'),
pd.Timestamp('2011-01-03'),
pd.Timestamp('2011-01-04')])
assert obj.dtype == 'datetime64[ns]'
cond = pd.Index([True, False, True, False])

msg = ("Index\\(\\.\\.\\.\\) must be called with a collection "
"of some kind")
with tm.assert_raises_regex(TypeError, msg):
obj.where(cond, fill_val)

values = pd.Index(pd.date_range(fill_val, periods=4))
exp = pd.Index([pd.Timestamp('2011-01-01'),
pd.Timestamp('2012-01-02', tz='US/Eastern'),
pd.Timestamp('2011-01-03'),
pd.Timestamp('2012-01-04', tz='US/Eastern')],
dtype=exp_dtype)

self._assert_where_conversion(obj, cond, values, exp, exp_dtype)

def test_where_index_complex128(self):
pass
Expand Down
24 changes: 24 additions & 0 deletions pandas/tests/reshape/merge/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -601,6 +601,30 @@ def test_merge_on_datetime64tz(self):
assert result['value_x'].dtype == 'datetime64[ns, US/Eastern]'
assert result['value_y'].dtype == 'datetime64[ns, US/Eastern]'

def test_merge_datetime64tz_with_dst_transition(self):
# GH 18885
df1 = pd.DataFrame(pd.date_range(
'2017-10-29 01:00', periods=4, freq='H', tz='Europe/Madrid'),
columns=['date'])
df1['value'] = 1
df2 = pd.DataFrame({
'date': pd.to_datetime([
'2017-10-29 03:00:00', '2017-10-29 04:00:00',
'2017-10-29 05:00:00'
]),
'value': 2
})
df2['date'] = df2['date'].dt.tz_localize('UTC').dt.tz_convert(
'Europe/Madrid')
result = pd.merge(df1, df2, how='outer', on='date')
expected = pd.DataFrame({
'date': pd.date_range(
'2017-10-29 01:00', periods=7, freq='H', tz='Europe/Madrid'),
'value_x': [1] * 4 + [np.nan] * 3,
'value_y': [np.nan] * 4 + [2] * 3
})
assert_frame_equal(result, expected)

def test_merge_non_unique_period_index(self):
# GH #16871
index = pd.period_range('2016-01-01', periods=16, freq='M')
Expand Down

0 comments on commit a277e4a

Please sign in to comment.