From a277e4aec312abe78689ce361025cd60b25cf0c0 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Mon, 1 Oct 2018 05:12:35 -0700 Subject: [PATCH] BUG: Merge timezone aware data with DST (#22825) --- doc/source/whatsnew/v0.24.0.txt | 1 + pandas/core/indexes/datetimelike.py | 59 ++++++++++++++++-------- pandas/tests/indexing/test_coercion.py | 40 +++++++++++----- pandas/tests/reshape/merge/test_merge.py | 24 ++++++++++ 4 files changed, 93 insertions(+), 31 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 5532771b38a0e..b71edcf1f6f51 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -815,6 +815,7 @@ Reshaping - Bug in :meth:`Series.replace` and meth:`DataFrame.replace` when dict is used as the ``to_replace`` value and one key in the dict is is another key's value, the results were inconsistent between using integer key and using string key (:issue:`20656`) - Bug in :meth:`DataFrame.drop_duplicates` for empty ``DataFrame`` which incorrectly raises an error (:issue:`20516`) - Bug in :func:`pandas.wide_to_long` when a string is passed to the stubnames argument and a column name is a substring of that stubname (:issue:`22468`) +- Bug in :func:`merge` when merging ``datetime64[ns, tz]`` data that contained a DST transition (:issue:`18885`) Build Changes ^^^^^^^^^^^^^ diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index f7f4f187f6202..37a12a588db03 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -277,7 +277,7 @@ def _evaluate_compare(self, other, op): except TypeError: return result - def _ensure_localized(self, result, ambiguous='raise'): + def _ensure_localized(self, arg, ambiguous='raise', from_utc=False): """ ensure that we are re-localized @@ -286,9 +286,11 @@ def _ensure_localized(self, result, ambiguous='raise'): Parameters ---------- - result : DatetimeIndex / i8 ndarray - ambiguous : str, bool, or bool-ndarray - default 'raise' + arg : DatetimeIndex / i8 ndarray + ambiguous : str, bool, or bool-ndarray, default 'raise' + from_utc : bool, default False + If True, localize the i8 ndarray to UTC first before converting to + the appropriate tz. If False, localize directly to the tz. Returns ------- @@ -297,10 +299,13 @@ def _ensure_localized(self, result, ambiguous='raise'): # reconvert to local tz if getattr(self, 'tz', None) is not None: - if not isinstance(result, ABCIndexClass): - result = self._simple_new(result) - result = result.tz_localize(self.tz, ambiguous=ambiguous) - return result + if not isinstance(arg, ABCIndexClass): + arg = self._simple_new(arg) + if from_utc: + arg = arg.tz_localize('UTC').tz_convert(self.tz) + else: + arg = arg.tz_localize(self.tz, ambiguous=ambiguous) + return arg def _box_values_as_index(self): """ @@ -622,11 +627,11 @@ def repeat(self, repeats, *args, **kwargs): @Appender(_index_shared_docs['where'] % _index_doc_kwargs) def where(self, cond, other=None): - other = _ensure_datetimelike_to_i8(other) - values = _ensure_datetimelike_to_i8(self) + other = _ensure_datetimelike_to_i8(other, to_utc=True) + values = _ensure_datetimelike_to_i8(self, to_utc=True) result = np.where(cond, values, other).astype('i8') - result = self._ensure_localized(result) + result = self._ensure_localized(result, from_utc=True) return self._shallow_copy(result, **self._get_attributes_dict()) @@ -695,23 +700,37 @@ def astype(self, dtype, copy=True): return super(DatetimeIndexOpsMixin, self).astype(dtype, copy=copy) -def _ensure_datetimelike_to_i8(other): - """ helper for coercing an input scalar or array to i8 """ +def _ensure_datetimelike_to_i8(other, to_utc=False): + """ + helper for coercing an input scalar or array to i8 + + Parameters + ---------- + other : 1d array + to_utc : bool, default False + If True, convert the values to UTC before extracting the i8 values + If False, extract the i8 values directly. + + Returns + ------- + i8 1d array + """ if is_scalar(other) and isna(other): - other = iNaT + return iNaT elif isinstance(other, ABCIndexClass): # convert tz if needed if getattr(other, 'tz', None) is not None: - other = other.tz_localize(None).asi8 - else: - other = other.asi8 + if to_utc: + other = other.tz_convert('UTC') + else: + other = other.tz_localize(None) else: try: - other = np.array(other, copy=False).view('i8') + return np.array(other, copy=False).view('i8') except TypeError: # period array cannot be coerces to int - other = Index(other).asi8 - return other + other = Index(other) + return other.asi8 def wrap_arithmetic_op(self, other, result): diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index e7daefffe5f6f..2f44cb36eeb11 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -590,11 +590,9 @@ def test_where_series_datetime64(self, fill_val, exp_dtype): pd.Timestamp('2011-01-03'), values[3]]) self._assert_where_conversion(obj, cond, values, exp, exp_dtype) - @pytest.mark.parametrize("fill_val,exp_dtype", [ - (pd.Timestamp('2012-01-01'), 'datetime64[ns]'), - (pd.Timestamp('2012-01-01', tz='US/Eastern'), np.object)], - ids=['datetime64', 'datetime64tz']) - def test_where_index_datetime(self, fill_val, exp_dtype): + def test_where_index_datetime(self): + fill_val = pd.Timestamp('2012-01-01') + exp_dtype = 'datetime64[ns]' obj = pd.Index([pd.Timestamp('2011-01-01'), pd.Timestamp('2011-01-02'), pd.Timestamp('2011-01-03'), @@ -613,13 +611,33 @@ def test_where_index_datetime(self, fill_val, exp_dtype): pd.Timestamp('2011-01-03'), pd.Timestamp('2012-01-04')]) - if fill_val.tz: - self._assert_where_conversion(obj, cond, values, exp, - 'datetime64[ns]') - pytest.xfail("ToDo: do not ignore timezone, must be object") self._assert_where_conversion(obj, cond, values, exp, exp_dtype) - pytest.xfail("datetime64 + datetime64 -> datetime64 must support" - " scalar") + + @pytest.mark.xfail( + reason="GH 22839: do not ignore timezone, must be object") + def test_where_index_datetimetz(self): + fill_val = pd.Timestamp('2012-01-01', tz='US/Eastern') + exp_dtype = np.object + obj = pd.Index([pd.Timestamp('2011-01-01'), + pd.Timestamp('2011-01-02'), + pd.Timestamp('2011-01-03'), + pd.Timestamp('2011-01-04')]) + assert obj.dtype == 'datetime64[ns]' + cond = pd.Index([True, False, True, False]) + + msg = ("Index\\(\\.\\.\\.\\) must be called with a collection " + "of some kind") + with tm.assert_raises_regex(TypeError, msg): + obj.where(cond, fill_val) + + values = pd.Index(pd.date_range(fill_val, periods=4)) + exp = pd.Index([pd.Timestamp('2011-01-01'), + pd.Timestamp('2012-01-02', tz='US/Eastern'), + pd.Timestamp('2011-01-03'), + pd.Timestamp('2012-01-04', tz='US/Eastern')], + dtype=exp_dtype) + + self._assert_where_conversion(obj, cond, values, exp, exp_dtype) def test_where_index_complex128(self): pass diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 42df4511578f1..50ef622a4147f 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -601,6 +601,30 @@ def test_merge_on_datetime64tz(self): assert result['value_x'].dtype == 'datetime64[ns, US/Eastern]' assert result['value_y'].dtype == 'datetime64[ns, US/Eastern]' + def test_merge_datetime64tz_with_dst_transition(self): + # GH 18885 + df1 = pd.DataFrame(pd.date_range( + '2017-10-29 01:00', periods=4, freq='H', tz='Europe/Madrid'), + columns=['date']) + df1['value'] = 1 + df2 = pd.DataFrame({ + 'date': pd.to_datetime([ + '2017-10-29 03:00:00', '2017-10-29 04:00:00', + '2017-10-29 05:00:00' + ]), + 'value': 2 + }) + df2['date'] = df2['date'].dt.tz_localize('UTC').dt.tz_convert( + 'Europe/Madrid') + result = pd.merge(df1, df2, how='outer', on='date') + expected = pd.DataFrame({ + 'date': pd.date_range( + '2017-10-29 01:00', periods=7, freq='H', tz='Europe/Madrid'), + 'value_x': [1] * 4 + [np.nan] * 3, + 'value_y': [np.nan] * 4 + [2] * 3 + }) + assert_frame_equal(result, expected) + def test_merge_non_unique_period_index(self): # GH #16871 index = pd.period_range('2016-01-01', periods=16, freq='M')