From 7295e4f992ea988a6723b22f4276444984c8ed13 Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Wed, 3 Jun 2015 17:49:58 -0400 Subject: [PATCH] BUG: Ensure 'coerce' actually coerces datatypes Changes behavior of convert objects so that passing 'coerce' will ensure that data of the correct type is returned, even if all values are null-types (NaN or NaT). closes #9589 --- pandas/core/common.py | 81 +++++++++++++--------------- pandas/core/groupby.py | 11 ++-- pandas/core/internals.py | 6 ++- pandas/tests/test_groupby.py | 2 +- pandas/tests/test_series.py | 100 +++++++++++++++++++++++++++++------ 5 files changed, 133 insertions(+), 67 deletions(-) diff --git a/pandas/core/common.py b/pandas/core/common.py index 1c9326c047a79..ac98f4657ae2e 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -34,7 +34,6 @@ class SettingWithCopyError(ValueError): class SettingWithCopyWarning(Warning): pass - class AmbiguousIndexError(PandasError, KeyError): pass @@ -1894,54 +1893,48 @@ def _possibly_convert_objects(values, convert_dates=True, if not hasattr(values, 'dtype'): values = np.array([values], dtype=np.object_) - # convert dates - if convert_dates and values.dtype == np.object_: - - # we take an aggressive stance and convert to datetime64[ns] - if convert_dates == 'coerce': - new_values = _possibly_cast_to_datetime( - values, 'M8[ns]', coerce=True) - - # if we are all nans then leave me alone - if not isnull(new_values).all(): - values = new_values - - else: - values = lib.maybe_convert_objects( - values, convert_datetime=convert_dates) + # If not object, do not attempt conversion + if not is_object_dtype(values.dtype): + return values - # convert timedeltas - if convert_timedeltas and values.dtype == np.object_: + # If 1 flag is coerce, ensure 2 others are False + conversions = (convert_dates, convert_numeric, convert_timedeltas) + if 'coerce' in conversions: + coerce_count = sum([c == 'coerce' for c in conversions]) + if coerce_count > 1: + raise ValueError("'coerce' can be used at most once.") - if convert_timedeltas == 'coerce': + # Immediate return if coerce + if convert_dates == 'coerce': + return _possibly_cast_to_datetime(values, 'M8[ns]', coerce=True) + elif convert_timedeltas == 'coerce': from pandas.tseries.timedeltas import to_timedelta - values = to_timedelta(values, coerce=True) - - # if we are all nans then leave me alone - if not isnull(new_values).all(): - values = new_values - - else: - values = lib.maybe_convert_objects( - values, convert_timedelta=convert_timedeltas) - + return to_timedelta(values, coerce=True, box=False) + elif convert_numeric == 'coerce': + return lib.maybe_convert_numeric(values, set(), coerce_numeric=True) + + # Soft conversions + if convert_dates: + values = lib.maybe_convert_objects(values, + convert_datetime=convert_dates) + + if convert_timedeltas and is_object_dtype(values.dtype): + # Object check to ensure only run if previous did not completely + # convert + values = lib.maybe_convert_objects(values, + convert_timedelta=convert_timedeltas) # convert to numeric - if values.dtype == np.object_: - if convert_numeric: - try: - new_values = lib.maybe_convert_numeric( - values, set(), coerce_numeric=True) - - # if we are all nans then leave me alone - if not isnull(new_values).all(): - values = new_values - - except: - pass - else: + if convert_numeric and is_object_dtype(values.dtype): + # Only if previous failed + try: + converted = lib.maybe_convert_numeric(values, + set(), + coerce_numeric=True) + # If all NaNs, then do not-alter + values = converted if not isnull(converted).all() else values - # soft-conversion - values = lib.maybe_convert_objects(values) + except: + pass return values diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 4abdd1112c721..4a33d6d1b0689 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2939,12 +2939,13 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): # if we have date/time like in the original, then coerce dates # as we are stacking can easily have object dtypes here - if (self._selected_obj.ndim == 2 - and self._selected_obj.dtypes.isin(_DATELIKE_DTYPES).any()): - cd = 'coerce' + if (self._selected_obj.ndim == 2 and self._selected_obj.dtypes.isin(_DATELIKE_DTYPES).any()): + result = result.convert_objects(convert_dates=False, convert_numeric=True) + date_cols = [col for col, is_date in zip(result, self._selected_obj.dtypes.isin(_DATELIKE_DTYPES)) if is_date] + result[date_cols] = result[date_cols].convert_objects(convert_dates='coerce') else: - cd = True - result = result.convert_objects(convert_dates=cd) + result = result.convert_objects(convert_dates=True) + return self._reindex_output(result) else: diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 3395ea360165e..e177b058974a3 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1484,8 +1484,10 @@ def convert(self, convert_dates=True, convert_numeric=True, convert_timedeltas=T else: values = com._possibly_convert_objects( - self.values.ravel(), convert_dates=convert_dates, - convert_numeric=convert_numeric + self.values.ravel(), + convert_dates=convert_dates, + convert_numeric=convert_numeric, + convert_timedeltas=convert_timedeltas ).reshape(self.values.shape) blocks.append(make_block(values, ndim=self.ndim, placement=self.mgr_locs)) diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index ab78bd63a7c94..7a98ad32077d0 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -599,7 +599,7 @@ def f(grp): return grp.iloc[0] result = df.groupby('A').apply(f)[['C']] e = df.groupby('A').first()[['C']] - e.loc['Pony'] = np.nan + e.loc['Pony'] = pd.NaT assert_frame_equal(result,e) # scalar outputs diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index eb583f17f3ace..0adad6b3a5f6a 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -5830,6 +5830,71 @@ def test_apply_dont_convert_dtype(self): self.assertEqual(result.dtype, object) def test_convert_objects(self): + # Tests: All to nans, coerce, true + # Test coercion returns correct type + s = Series(['a', 'b', 'c']) + results = s.convert_objects('coerce', False, False) + expected = Series([lib.NaT] * 3) + assert_series_equal(results, expected) + + results = s.convert_objects(False, 'coerce', False) + expected = Series([np.nan] * 3) + assert_series_equal(results, expected) + + expected = Series([lib.NaT] * 3, dtype=np.dtype('m8[ns]')) + results = s.convert_objects(False, False, 'coerce') + assert_series_equal(results, expected) + + dt = datetime(2001, 1, 1, 0, 0) + td = dt - datetime(2000, 1, 1, 0, 0) + # Test coercion with mixed types + s = Series(['a', '3.1415', dt, td]) + results = s.convert_objects('coerce',False,False) + expected = Series([lib.NaT, lib.NaT, dt, lib.NaT]) + assert_series_equal(results, expected) + + results = s.convert_objects(False, 'coerce',False) + expected = Series([nan, 3.1415, nan, nan]) + assert_series_equal(results, expected) + + results = s.convert_objects(False, False, 'coerce') + expected = Series([lib.NaT, lib.NaT, lib.NaT, td], + dtype=np.dtype('m8[ns]')) + assert_series_equal(results, expected) + + # Test standard conversion returns original + results = s.convert_objects(True, False, False) + assert_series_equal(results, s) + results = s.convert_objects(False, True, False) + expected = Series([nan, 3.1415, nan, nan]) + assert_series_equal(results, expected) + results = s.convert_objects(False, False, True) + assert_series_equal(results, s) + + # test pass-through and non-conversion when other types selected + s = Series(['1.0','2.0','3.0']) + results = s.convert_objects(True,True,True) + expected = Series([1.0,2.0,3.0]) + assert_series_equal(results, expected) + results = s.convert_objects(True,False,True) + assert_series_equal(results, s) + + s = Series([datetime(2001, 1, 1, 0, 0),datetime(2001, 1, 1, 0, 0)], + dtype='O') + results = s.convert_objects(True,True,True) + expected = Series([datetime(2001, 1, 1, 0, 0),datetime(2001, 1, 1, 0, 0)]) + assert_series_equal(results, expected) + results = s.convert_objects(False,True,True) + assert_series_equal(results, s) + + td = datetime(2001, 1, 1, 0, 0) - datetime(2000, 1, 1, 0, 0) + s = Series([td, td], dtype='O') + results = s.convert_objects(True,True,True) + expected = Series([td, td]) + assert_series_equal(results, expected) + results = s.convert_objects(True,True,False) + assert_series_equal(results, s) + s = Series([1., 2, 3], index=['a', 'b', 'c']) result = s.convert_objects(convert_dates=False, convert_numeric=True) @@ -5848,20 +5913,19 @@ def test_convert_objects(self): r = s.copy().astype('O') r['a'] = 'garbled' - expected = s.copy() - expected['a'] = np.nan result = r.convert_objects(convert_dates=False, convert_numeric=True) + expected = s.copy() + expected['a'] = nan assert_series_equal(result, expected) # GH 4119, not converting a mixed type (e.g.floats and object) s = Series([1, 'na', 3, 4]) result = s.convert_objects(convert_numeric=True) - expected = Series([1, np.nan, 3, 4]) + expected = Series([1, nan, 3, 4]) assert_series_equal(result, expected) s = Series([1, '', 3, 4]) result = s.convert_objects(convert_numeric=True) - expected = Series([1, np.nan, 3, 4]) assert_series_equal(result, expected) # dates @@ -5885,23 +5949,28 @@ def test_convert_objects(self): [Timestamp( '20010101'), Timestamp('20010102'), Timestamp('20010103'), lib.NaT, lib.NaT, lib.NaT, Timestamp('20010104'), Timestamp('20010105')], dtype='M8[ns]') - result = s2.convert_objects( - convert_dates='coerce', convert_numeric=False) + result = s2.convert_objects(convert_dates='coerce', + convert_numeric=False, + convert_timedeltas=False) assert_series_equal(result, expected) - result = s2.convert_objects( - convert_dates='coerce', convert_numeric=True) + result = s2.convert_objects(convert_dates='coerce', + convert_numeric=False, + convert_timedeltas=False) assert_series_equal(result, expected) # preserver all-nans (if convert_dates='coerce') s = Series(['foo', 'bar', 1, 1.0], dtype='O') - result = s.convert_objects( - convert_dates='coerce', convert_numeric=False) - assert_series_equal(result, s) + result = s.convert_objects(convert_dates='coerce', + convert_numeric=False, + convert_timedeltas=False) + expected = Series([lib.NaT]*4) + assert_series_equal(result, expected) # preserver if non-object s = Series([1], dtype='float32') - result = s.convert_objects( - convert_dates='coerce', convert_numeric=False) + result = s.convert_objects(convert_dates='coerce', + convert_numeric=False, + convert_timedeltas=False) assert_series_equal(result, s) #r = s.copy() @@ -5910,13 +5979,14 @@ def test_convert_objects(self): #self.assertEqual(result.dtype, 'M8[ns]') # dateutil parses some single letters into today's value as a date + expected = Series([lib.NaT]) for x in 'abcdefghijklmnopqrstuvwxyz': s = Series([x]) result = s.convert_objects(convert_dates='coerce') - assert_series_equal(result, s) + assert_series_equal(result, expected) s = Series([x.upper()]) result = s.convert_objects(convert_dates='coerce') - assert_series_equal(result, s) + assert_series_equal(result, expected) def test_convert_objects_preserve_bool(self): s = Series([1, True, 3, 5], dtype=object)