Skip to content

Commit

Permalink
BUG: Ensure 'coerce' actually coerces datatypes
Browse files Browse the repository at this point in the history
Changes the behavior of `convert_objects` so that passing 'coerce'
ensures that data of the requested type is returned, even if all
values are null-types (NaN or NaT).

closes #9589
  • Loading branch information
Kevin Sheppard committed Jun 4, 2015
1 parent bc7d48f commit 7295e4f
Show file tree
Hide file tree
Showing 5 changed files with 133 additions and 67 deletions.
81 changes: 37 additions & 44 deletions pandas/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@ class SettingWithCopyError(ValueError):
class SettingWithCopyWarning(Warning):
pass


class AmbiguousIndexError(PandasError, KeyError):
pass

Expand Down Expand Up @@ -1894,54 +1893,48 @@ def _possibly_convert_objects(values, convert_dates=True,
if not hasattr(values, 'dtype'):
values = np.array([values], dtype=np.object_)

# convert dates
if convert_dates and values.dtype == np.object_:

# we take an aggressive stance and convert to datetime64[ns]
if convert_dates == 'coerce':
new_values = _possibly_cast_to_datetime(
values, 'M8[ns]', coerce=True)

# if we are all nans then leave me alone
if not isnull(new_values).all():
values = new_values

else:
values = lib.maybe_convert_objects(
values, convert_datetime=convert_dates)
# If not object, do not attempt conversion
if not is_object_dtype(values.dtype):
return values

# convert timedeltas
if convert_timedeltas and values.dtype == np.object_:
# If 1 flag is coerce, ensure 2 others are False
conversions = (convert_dates, convert_numeric, convert_timedeltas)
if 'coerce' in conversions:
coerce_count = sum([c == 'coerce' for c in conversions])
if coerce_count > 1:
raise ValueError("'coerce' can be used at most once.")

if convert_timedeltas == 'coerce':
# Immediate return if coerce
if convert_dates == 'coerce':
return _possibly_cast_to_datetime(values, 'M8[ns]', coerce=True)
elif convert_timedeltas == 'coerce':
from pandas.tseries.timedeltas import to_timedelta
values = to_timedelta(values, coerce=True)

# if we are all nans then leave me alone
if not isnull(new_values).all():
values = new_values

else:
values = lib.maybe_convert_objects(
values, convert_timedelta=convert_timedeltas)

return to_timedelta(values, coerce=True, box=False)
elif convert_numeric == 'coerce':
return lib.maybe_convert_numeric(values, set(), coerce_numeric=True)

# Soft conversions
if convert_dates:
values = lib.maybe_convert_objects(values,
convert_datetime=convert_dates)

if convert_timedeltas and is_object_dtype(values.dtype):
# Object check to ensure only run if previous did not completely
# convert
values = lib.maybe_convert_objects(values,
convert_timedelta=convert_timedeltas)
# convert to numeric
if values.dtype == np.object_:
if convert_numeric:
try:
new_values = lib.maybe_convert_numeric(
values, set(), coerce_numeric=True)

# if we are all nans then leave me alone
if not isnull(new_values).all():
values = new_values

except:
pass
else:
if convert_numeric and is_object_dtype(values.dtype):
# Only if previous failed
try:
converted = lib.maybe_convert_numeric(values,
set(),
coerce_numeric=True)
# If all NaNs, then do not-alter
values = converted if not isnull(converted).all() else values

# soft-conversion
values = lib.maybe_convert_objects(values)
except:
pass

return values

Expand Down
11 changes: 6 additions & 5 deletions pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2939,12 +2939,13 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):

# if we have date/time like in the original, then coerce dates
# as we are stacking can easily have object dtypes here
if (self._selected_obj.ndim == 2
and self._selected_obj.dtypes.isin(_DATELIKE_DTYPES).any()):
cd = 'coerce'
if (self._selected_obj.ndim == 2 and self._selected_obj.dtypes.isin(_DATELIKE_DTYPES).any()):
result = result.convert_objects(convert_dates=False, convert_numeric=True)
date_cols = [col for col, is_date in zip(result, self._selected_obj.dtypes.isin(_DATELIKE_DTYPES)) if is_date]
result[date_cols] = result[date_cols].convert_objects(convert_dates='coerce')
else:
cd = True
result = result.convert_objects(convert_dates=cd)
result = result.convert_objects(convert_dates=True)

return self._reindex_output(result)

else:
Expand Down
6 changes: 4 additions & 2 deletions pandas/core/internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -1484,8 +1484,10 @@ def convert(self, convert_dates=True, convert_numeric=True, convert_timedeltas=T
else:

values = com._possibly_convert_objects(
self.values.ravel(), convert_dates=convert_dates,
convert_numeric=convert_numeric
self.values.ravel(),
convert_dates=convert_dates,
convert_numeric=convert_numeric,
convert_timedeltas=convert_timedeltas
).reshape(self.values.shape)
blocks.append(make_block(values,
ndim=self.ndim, placement=self.mgr_locs))
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -599,7 +599,7 @@ def f(grp):
return grp.iloc[0]
result = df.groupby('A').apply(f)[['C']]
e = df.groupby('A').first()[['C']]
e.loc['Pony'] = np.nan
e.loc['Pony'] = pd.NaT
assert_frame_equal(result,e)

# scalar outputs
Expand Down
100 changes: 85 additions & 15 deletions pandas/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -5830,6 +5830,71 @@ def test_apply_dont_convert_dtype(self):
self.assertEqual(result.dtype, object)

def test_convert_objects(self):
# Tests: All to nans, coerce, true
# Test coercion returns correct type
s = Series(['a', 'b', 'c'])
results = s.convert_objects('coerce', False, False)
expected = Series([lib.NaT] * 3)
assert_series_equal(results, expected)

results = s.convert_objects(False, 'coerce', False)
expected = Series([np.nan] * 3)
assert_series_equal(results, expected)

expected = Series([lib.NaT] * 3, dtype=np.dtype('m8[ns]'))
results = s.convert_objects(False, False, 'coerce')
assert_series_equal(results, expected)

dt = datetime(2001, 1, 1, 0, 0)
td = dt - datetime(2000, 1, 1, 0, 0)
# Test coercion with mixed types
s = Series(['a', '3.1415', dt, td])
results = s.convert_objects('coerce',False,False)
expected = Series([lib.NaT, lib.NaT, dt, lib.NaT])
assert_series_equal(results, expected)

results = s.convert_objects(False, 'coerce',False)
expected = Series([nan, 3.1415, nan, nan])
assert_series_equal(results, expected)

results = s.convert_objects(False, False, 'coerce')
expected = Series([lib.NaT, lib.NaT, lib.NaT, td],
dtype=np.dtype('m8[ns]'))
assert_series_equal(results, expected)

# Test standard conversion returns original
results = s.convert_objects(True, False, False)
assert_series_equal(results, s)
results = s.convert_objects(False, True, False)
expected = Series([nan, 3.1415, nan, nan])
assert_series_equal(results, expected)
results = s.convert_objects(False, False, True)
assert_series_equal(results, s)

# test pass-through and non-conversion when other types selected
s = Series(['1.0','2.0','3.0'])
results = s.convert_objects(True,True,True)
expected = Series([1.0,2.0,3.0])
assert_series_equal(results, expected)
results = s.convert_objects(True,False,True)
assert_series_equal(results, s)

s = Series([datetime(2001, 1, 1, 0, 0),datetime(2001, 1, 1, 0, 0)],
dtype='O')
results = s.convert_objects(True,True,True)
expected = Series([datetime(2001, 1, 1, 0, 0),datetime(2001, 1, 1, 0, 0)])
assert_series_equal(results, expected)
results = s.convert_objects(False,True,True)
assert_series_equal(results, s)

td = datetime(2001, 1, 1, 0, 0) - datetime(2000, 1, 1, 0, 0)
s = Series([td, td], dtype='O')
results = s.convert_objects(True,True,True)
expected = Series([td, td])
assert_series_equal(results, expected)
results = s.convert_objects(True,True,False)
assert_series_equal(results, s)


s = Series([1., 2, 3], index=['a', 'b', 'c'])
result = s.convert_objects(convert_dates=False, convert_numeric=True)
Expand All @@ -5848,20 +5913,19 @@ def test_convert_objects(self):

r = s.copy().astype('O')
r['a'] = 'garbled'
expected = s.copy()
expected['a'] = np.nan
result = r.convert_objects(convert_dates=False, convert_numeric=True)
expected = s.copy()
expected['a'] = nan
assert_series_equal(result, expected)

# GH 4119, not converting a mixed type (e.g.floats and object)
s = Series([1, 'na', 3, 4])
result = s.convert_objects(convert_numeric=True)
expected = Series([1, np.nan, 3, 4])
expected = Series([1, nan, 3, 4])
assert_series_equal(result, expected)

s = Series([1, '', 3, 4])
result = s.convert_objects(convert_numeric=True)
expected = Series([1, np.nan, 3, 4])
assert_series_equal(result, expected)

# dates
Expand All @@ -5885,23 +5949,28 @@ def test_convert_objects(self):
[Timestamp(
'20010101'), Timestamp('20010102'), Timestamp('20010103'),
lib.NaT, lib.NaT, lib.NaT, Timestamp('20010104'), Timestamp('20010105')], dtype='M8[ns]')
result = s2.convert_objects(
convert_dates='coerce', convert_numeric=False)
result = s2.convert_objects(convert_dates='coerce',
convert_numeric=False,
convert_timedeltas=False)
assert_series_equal(result, expected)
result = s2.convert_objects(
convert_dates='coerce', convert_numeric=True)
result = s2.convert_objects(convert_dates='coerce',
convert_numeric=False,
convert_timedeltas=False)
assert_series_equal(result, expected)

# preserver all-nans (if convert_dates='coerce')
s = Series(['foo', 'bar', 1, 1.0], dtype='O')
result = s.convert_objects(
convert_dates='coerce', convert_numeric=False)
assert_series_equal(result, s)
result = s.convert_objects(convert_dates='coerce',
convert_numeric=False,
convert_timedeltas=False)
expected = Series([lib.NaT]*4)
assert_series_equal(result, expected)

# preserver if non-object
s = Series([1], dtype='float32')
result = s.convert_objects(
convert_dates='coerce', convert_numeric=False)
result = s.convert_objects(convert_dates='coerce',
convert_numeric=False,
convert_timedeltas=False)
assert_series_equal(result, s)

#r = s.copy()
Expand All @@ -5910,13 +5979,14 @@ def test_convert_objects(self):
#self.assertEqual(result.dtype, 'M8[ns]')

# dateutil parses some single letters into today's value as a date
expected = Series([lib.NaT])
for x in 'abcdefghijklmnopqrstuvwxyz':
s = Series([x])
result = s.convert_objects(convert_dates='coerce')
assert_series_equal(result, s)
assert_series_equal(result, expected)
s = Series([x.upper()])
result = s.convert_objects(convert_dates='coerce')
assert_series_equal(result, s)
assert_series_equal(result, expected)

def test_convert_objects_preserve_bool(self):
s = Series([1, True, 3, 5], dtype=object)
Expand Down

0 comments on commit 7295e4f

Please sign in to comment.