Skip to content

Commit

Permalink
BUG: Ensure 'coerce' actually coerces datatypes
Browse files Browse the repository at this point in the history
Changes the behavior of `convert_objects` so that passing 'coerce' will
ensure that data of the correct type is returned, even if all
values are null types (NaN or NaT).

closes pandas-dev#9589
  • Loading branch information
Kevin Sheppard committed Jun 4, 2015
1 parent bc7d48f commit 7d5b7da
Show file tree
Hide file tree
Showing 5 changed files with 137 additions and 73 deletions.
81 changes: 37 additions & 44 deletions pandas/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@ class SettingWithCopyError(ValueError):
class SettingWithCopyWarning(Warning):
pass


class AmbiguousIndexError(PandasError, KeyError):
pass

Expand Down Expand Up @@ -1894,54 +1893,48 @@ def _possibly_convert_objects(values, convert_dates=True,
if not hasattr(values, 'dtype'):
values = np.array([values], dtype=np.object_)

# convert dates
if convert_dates and values.dtype == np.object_:

# we take an aggressive stance and convert to datetime64[ns]
if convert_dates == 'coerce':
new_values = _possibly_cast_to_datetime(
values, 'M8[ns]', coerce=True)

# if we are all nans then leave me alone
if not isnull(new_values).all():
values = new_values

else:
values = lib.maybe_convert_objects(
values, convert_datetime=convert_dates)
# If not object, do not attempt conversion
if not is_object_dtype(values.dtype):
return values

# convert timedeltas
if convert_timedeltas and values.dtype == np.object_:
# If 1 flag is coerce, ensure 2 others are False
conversions = (convert_dates, convert_numeric, convert_timedeltas)
if 'coerce' in conversions:
coerce_count = sum([c == 'coerce' for c in conversions])
if coerce_count > 1:
raise ValueError("'coerce' can be used at most once.")

if convert_timedeltas == 'coerce':
# Immediate return if coerce
if convert_dates == 'coerce':
return _possibly_cast_to_datetime(values, 'M8[ns]', coerce=True)
elif convert_timedeltas == 'coerce':
from pandas.tseries.timedeltas import to_timedelta
values = to_timedelta(values, coerce=True)

# if we are all nans then leave me alone
if not isnull(new_values).all():
values = new_values

else:
values = lib.maybe_convert_objects(
values, convert_timedelta=convert_timedeltas)

return to_timedelta(values, coerce=True, box=False)
elif convert_numeric == 'coerce':
return lib.maybe_convert_numeric(values, set(), coerce_numeric=True)

# Soft conversions
if convert_dates:
values = lib.maybe_convert_objects(values,
convert_datetime=convert_dates)

if convert_timedeltas and is_object_dtype(values.dtype):
# Object check to ensure only run if previous did not completely
# convert
values = lib.maybe_convert_objects(values,
convert_timedelta=convert_timedeltas)
# convert to numeric
if values.dtype == np.object_:
if convert_numeric:
try:
new_values = lib.maybe_convert_numeric(
values, set(), coerce_numeric=True)

# if we are all nans then leave me alone
if not isnull(new_values).all():
values = new_values

except:
pass
else:
if convert_numeric and is_object_dtype(values.dtype):
# Only if previous failed
try:
converted = lib.maybe_convert_numeric(values,
set(),
coerce_numeric=True)
# If all NaNs, then do not alter
values = converted if not isnull(converted).all() else values

# soft-conversion
values = lib.maybe_convert_objects(values)
except:
pass

return values

Expand Down
11 changes: 6 additions & 5 deletions pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2939,12 +2939,13 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):

# if we have date/time like in the original, then coerce dates
# as we are stacking can easily have object dtypes here
if (self._selected_obj.ndim == 2
and self._selected_obj.dtypes.isin(_DATELIKE_DTYPES).any()):
cd = 'coerce'
if (self._selected_obj.ndim == 2 and self._selected_obj.dtypes.isin(_DATELIKE_DTYPES).any()):
result = result.convert_objects(convert_dates=False, convert_numeric=True)
date_cols = [col for col, is_date in zip(result, self._selected_obj.dtypes.isin(_DATELIKE_DTYPES)) if is_date]
result[date_cols] = result[date_cols].convert_objects(convert_dates='coerce')
else:
cd = True
result = result.convert_objects(convert_dates=cd)
result = result.convert_objects(convert_dates=True)

return self._reindex_output(result)

else:
Expand Down
6 changes: 4 additions & 2 deletions pandas/core/internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -1484,8 +1484,10 @@ def convert(self, convert_dates=True, convert_numeric=True, convert_timedeltas=T
else:

values = com._possibly_convert_objects(
self.values.ravel(), convert_dates=convert_dates,
convert_numeric=convert_numeric
self.values.ravel(),
convert_dates=convert_dates,
convert_numeric=convert_numeric,
convert_timedeltas=convert_timedeltas
).reshape(self.values.shape)
blocks.append(make_block(values,
ndim=self.ndim, placement=self.mgr_locs))
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -599,7 +599,7 @@ def f(grp):
return grp.iloc[0]
result = df.groupby('A').apply(f)[['C']]
e = df.groupby('A').first()[['C']]
e.loc['Pony'] = np.nan
e.loc['Pony'] = pd.NaT
assert_frame_equal(result,e)

# scalar outputs
Expand Down
110 changes: 89 additions & 21 deletions pandas/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,15 @@
from inspect import getargspec
from itertools import product, starmap
from distutils.version import LooseVersion
import warnings

import nose

from numpy import nan, inf
import numpy as np
import numpy.ma as ma
import pandas as pd
import pandas.lib as lib

import pandas as pd
from pandas import (Index, Series, DataFrame, isnull, notnull, bdate_range,
date_range, period_range, timedelta_range)
from pandas.core.index import MultiIndex
Expand All @@ -25,11 +26,8 @@
from pandas.tseries.tdi import Timedelta, TimedeltaIndex
import pandas.core.common as com
import pandas.core.config as cf
import pandas.lib as lib

import pandas.core.datetools as datetools
import pandas.core.nanops as nanops

from pandas.compat import StringIO, lrange, range, zip, u, OrderedDict, long
from pandas import compat
from pandas.util.testing import (assert_series_equal,
Expand All @@ -39,6 +37,7 @@
import pandas.util.testing as tm



#------------------------------------------------------------------------------
# Series test cases

Expand Down Expand Up @@ -3442,7 +3441,6 @@ def test_ops_datetimelike_align(self):

def test_timedelta64_functions(self):

from datetime import timedelta
from pandas import date_range

# index min/max
Expand Down Expand Up @@ -5830,6 +5828,71 @@ def test_apply_dont_convert_dtype(self):
self.assertEqual(result.dtype, object)

def test_convert_objects(self):
# Tests: All to nans, coerce, true
# Test coercion returns correct type
s = Series(['a', 'b', 'c'])
results = s.convert_objects('coerce', False, False)
expected = Series([lib.NaT] * 3)
assert_series_equal(results, expected)

results = s.convert_objects(False, 'coerce', False)
expected = Series([np.nan] * 3)
assert_series_equal(results, expected)

expected = Series([lib.NaT] * 3, dtype=np.dtype('m8[ns]'))
results = s.convert_objects(False, False, 'coerce')
assert_series_equal(results, expected)

dt = datetime(2001, 1, 1, 0, 0)
td = dt - datetime(2000, 1, 1, 0, 0)
# Test coercion with mixed types
s = Series(['a', '3.1415', dt, td])
results = s.convert_objects('coerce',False,False)
expected = Series([lib.NaT, lib.NaT, dt, lib.NaT])
assert_series_equal(results, expected)

results = s.convert_objects(False, 'coerce',False)
expected = Series([nan, 3.1415, nan, nan])
assert_series_equal(results, expected)

results = s.convert_objects(False, False, 'coerce')
expected = Series([lib.NaT, lib.NaT, lib.NaT, td],
dtype=np.dtype('m8[ns]'))
assert_series_equal(results, expected)

# Test standard conversion returns original
results = s.convert_objects(True, False, False)
assert_series_equal(results, s)
results = s.convert_objects(False, True, False)
expected = Series([nan, 3.1415, nan, nan])
assert_series_equal(results, expected)
results = s.convert_objects(False, False, True)
assert_series_equal(results, s)

# test pass-through and non-conversion when other types selected
s = Series(['1.0','2.0','3.0'])
results = s.convert_objects(True,True,True)
expected = Series([1.0,2.0,3.0])
assert_series_equal(results, expected)
results = s.convert_objects(True,False,True)
assert_series_equal(results, s)

s = Series([datetime(2001, 1, 1, 0, 0),datetime(2001, 1, 1, 0, 0)],
dtype='O')
results = s.convert_objects(True,True,True)
expected = Series([datetime(2001, 1, 1, 0, 0),datetime(2001, 1, 1, 0, 0)])
assert_series_equal(results, expected)
results = s.convert_objects(False,True,True)
assert_series_equal(results, s)

td = datetime(2001, 1, 1, 0, 0) - datetime(2000, 1, 1, 0, 0)
s = Series([td, td], dtype='O')
results = s.convert_objects(True,True,True)
expected = Series([td, td])
assert_series_equal(results, expected)
results = s.convert_objects(True,True,False)
assert_series_equal(results, s)


s = Series([1., 2, 3], index=['a', 'b', 'c'])
result = s.convert_objects(convert_dates=False, convert_numeric=True)
Expand All @@ -5848,20 +5911,19 @@ def test_convert_objects(self):

r = s.copy().astype('O')
r['a'] = 'garbled'
expected = s.copy()
expected['a'] = np.nan
result = r.convert_objects(convert_dates=False, convert_numeric=True)
expected = s.copy()
expected['a'] = nan
assert_series_equal(result, expected)

# GH 4119, not converting a mixed type (e.g.floats and object)
s = Series([1, 'na', 3, 4])
result = s.convert_objects(convert_numeric=True)
expected = Series([1, np.nan, 3, 4])
expected = Series([1, nan, 3, 4])
assert_series_equal(result, expected)

s = Series([1, '', 3, 4])
result = s.convert_objects(convert_numeric=True)
expected = Series([1, np.nan, 3, 4])
assert_series_equal(result, expected)

# dates
Expand All @@ -5885,23 +5947,28 @@ def test_convert_objects(self):
[Timestamp(
'20010101'), Timestamp('20010102'), Timestamp('20010103'),
lib.NaT, lib.NaT, lib.NaT, Timestamp('20010104'), Timestamp('20010105')], dtype='M8[ns]')
result = s2.convert_objects(
convert_dates='coerce', convert_numeric=False)
result = s2.convert_objects(convert_dates='coerce',
convert_numeric=False,
convert_timedeltas=False)
assert_series_equal(result, expected)
result = s2.convert_objects(
convert_dates='coerce', convert_numeric=True)
result = s2.convert_objects(convert_dates='coerce',
convert_numeric=False,
convert_timedeltas=False)
assert_series_equal(result, expected)

# preserve all-nans (if convert_dates='coerce')
s = Series(['foo', 'bar', 1, 1.0], dtype='O')
result = s.convert_objects(
convert_dates='coerce', convert_numeric=False)
assert_series_equal(result, s)
result = s.convert_objects(convert_dates='coerce',
convert_numeric=False,
convert_timedeltas=False)
expected = Series([lib.NaT]*4)
assert_series_equal(result, expected)

# preserve if non-object
s = Series([1], dtype='float32')
result = s.convert_objects(
convert_dates='coerce', convert_numeric=False)
result = s.convert_objects(convert_dates='coerce',
convert_numeric=False,
convert_timedeltas=False)
assert_series_equal(result, s)

#r = s.copy()
Expand All @@ -5910,13 +5977,14 @@ def test_convert_objects(self):
#self.assertEqual(result.dtype, 'M8[ns]')

# dateutil parses some single letters into today's value as a date
expected = Series([lib.NaT])
for x in 'abcdefghijklmnopqrstuvwxyz':
s = Series([x])
result = s.convert_objects(convert_dates='coerce')
assert_series_equal(result, s)
assert_series_equal(result, expected)
s = Series([x.upper()])
result = s.convert_objects(convert_dates='coerce')
assert_series_equal(result, s)
assert_series_equal(result, expected)

def test_convert_objects_preserve_bool(self):
s = Series([1, True, 3, 5], dtype=object)
Expand Down

0 comments on commit 7d5b7da

Please sign in to comment.