Skip to content

Commit

Permalink
BUG: Ensure 'coerce' actually coerces datatypes
Browse files Browse the repository at this point in the history
Changes the behavior of convert_objects so that passing 'coerce'
ensures that data of the correct type is returned, even if all
values are null-types (NaN or NaT).

closes #9589
  • Loading branch information
Kevin Sheppard committed Jun 4, 2015
1 parent 93150ba commit d9f7951
Show file tree
Hide file tree
Showing 3 changed files with 126 additions and 62 deletions.
78 changes: 35 additions & 43 deletions pandas/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@ class SettingWithCopyError(ValueError):
class SettingWithCopyWarning(Warning):
pass


class AmbiguousIndexError(PandasError, KeyError):
pass

Expand Down Expand Up @@ -1894,54 +1893,47 @@ def _possibly_convert_objects(values, convert_dates=True,
if not hasattr(values, 'dtype'):
values = np.array([values], dtype=np.object_)

# convert dates
if convert_dates and values.dtype == np.object_:
# If not object, do not attempt conversion
if not is_object_dtype(values.dtype):
return values

# we take an aggressive stance and convert to datetime64[ns]
if convert_dates == 'coerce':
new_values = _possibly_cast_to_datetime(
values, 'M8[ns]', coerce=True)
# 'coerce' may be passed for at most one of the three conversion flags
conversions = (convert_dates, convert_numeric, convert_timedeltas)
if 'coerce' in conversions:
coerce_count = sum([c == 'coerce' for c in conversions])
if coerce_count > 1:
raise ValueError("'coerce' can be used at most once.")

# if we are all nans then leave me alone
if not isnull(new_values).all():
values = new_values

else:
values = lib.maybe_convert_objects(
values, convert_datetime=convert_dates)

# convert timedeltas
if convert_timedeltas and values.dtype == np.object_:

if convert_timedeltas == 'coerce':
# Immediate return if coerce
if convert_dates == 'coerce':
return _possibly_cast_to_datetime(values, 'M8[ns]', coerce=True)
elif convert_timedeltas == 'coerce':
from pandas.tseries.timedeltas import to_timedelta
values = to_timedelta(values, coerce=True)

return np.asanyarray(to_timedelta(values, coerce=True))
elif convert_numeric == 'coerce':
return lib.maybe_convert_numeric(values, set(), coerce_numeric=True)

# Soft conversions
if convert_dates:
values = lib.maybe_convert_objects(values,
convert_datetime=convert_dates)

if convert_timedeltas and is_object_dtype(values.dtype):
# Object check to ensure only run if previous did not completely
# convert
values = lib.maybe_convert_objects(values,
convert_timedelta=convert_timedeltas)
# convert to numeric
if convert_numeric and is_object_dtype(values.dtype):
# Only if previous failed
try:
new_values = lib.maybe_convert_numeric(values, set(),
coerce_numeric=True)
# if we are all nans then leave me alone
if not isnull(new_values).all():
values = new_values

else:
values = lib.maybe_convert_objects(
values, convert_timedelta=convert_timedeltas)

# convert to numeric
if values.dtype == np.object_:
if convert_numeric:
try:
new_values = lib.maybe_convert_numeric(
values, set(), coerce_numeric=True)

# if we are all nans then leave me alone
if not isnull(new_values).all():
values = new_values

except:
pass
else:

# soft-conversion
values = lib.maybe_convert_objects(values)
except:
pass

return values

Expand Down
6 changes: 4 additions & 2 deletions pandas/core/internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -1484,8 +1484,10 @@ def convert(self, convert_dates=True, convert_numeric=True, convert_timedeltas=T
else:

values = com._possibly_convert_objects(
self.values.ravel(), convert_dates=convert_dates,
convert_numeric=convert_numeric
self.values.ravel(),
convert_dates=convert_dates,
convert_numeric=convert_numeric,
convert_timedeltas=convert_timedeltas
).reshape(self.values.shape)
blocks.append(make_block(values,
ndim=self.ndim, placement=self.mgr_locs))
Expand Down
104 changes: 87 additions & 17 deletions pandas/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,15 @@
from inspect import getargspec
from itertools import product, starmap
from distutils.version import LooseVersion
import warnings

import nose

from numpy import nan, inf
import numpy as np
import numpy.ma as ma
import pandas as pd
import pandas.lib as lib

import pandas as pd
from pandas import (Index, Series, DataFrame, isnull, notnull, bdate_range,
date_range, period_range, timedelta_range)
from pandas.core.index import MultiIndex
Expand All @@ -25,11 +26,8 @@
from pandas.tseries.tdi import Timedelta, TimedeltaIndex
import pandas.core.common as com
import pandas.core.config as cf
import pandas.lib as lib

import pandas.core.datetools as datetools
import pandas.core.nanops as nanops

from pandas.compat import StringIO, lrange, range, zip, u, OrderedDict, long
from pandas import compat
from pandas.util.testing import (assert_series_equal,
Expand All @@ -39,6 +37,7 @@
import pandas.util.testing as tm



#------------------------------------------------------------------------------
# Series test cases

Expand Down Expand Up @@ -3442,7 +3441,6 @@ def test_ops_datetimelike_align(self):

def test_timedelta64_functions(self):

from datetime import timedelta
from pandas import date_range

# index min/max
Expand Down Expand Up @@ -5830,6 +5828,72 @@ def test_apply_dont_convert_dtype(self):
self.assertEqual(result.dtype, object)

def test_convert_objects(self):
# Tests: All to nans, coerce, true
# Test coercion returns correct type
s = Series(['a', 'b', 'c'])
results = s.convert_objects('coerce', False, False)
expected = Series([lib.NaT] * 3)
assert_series_equal(results, expected)

results = s.convert_objects(False, 'coerce', False)
expected = Series([np.nan] * 3)
assert_series_equal(results, expected)

expected = Series([lib.NaT] * 3, dtype=np.dtype('m8[ns]'))
results = s.convert_objects(False, False, 'coerce')
assert_series_equal(results, expected)

dt = datetime(2001, 1, 1, 0, 0)
td = dt - datetime(2000, 1, 1, 0, 0)
# Test coercion with mixed types
s = Series(['a', '3.1415', dt, td])
results = s.convert_objects('coerce',False,False)
expected = Series([lib.NaT, lib.NaT, dt, lib.NaT])
assert_series_equal(results, expected)

results = s.convert_objects(False, 'coerce',False)
expected = Series([nan, 3.1415, nan, nan])
assert_series_equal(results, expected)

results = s.convert_objects(False, False, 'coerce')
expected = Series([lib.NaT, lib.NaT, lib.NaT, td],
dtype=np.dtype('m8[ns]'))
assert_series_equal(results, expected)

# Test standard conversion returns original
results = s.convert_objects(True, False, False)
assert_series_equal(results, s)
# TODO: This test fails since numeric conversion
# TODO: is different from date or ts conversion
#results = s.convert_objects(False, True, False)
#assert_series_equal(results, s)
results = s.convert_objects(False, False, True)
assert_series_equal(results, s)

# test pass-through and non-conversion when other types selected
s = Series(['1.0','2.0','3.0'])
results = s.convert_objects(True,True,True)
expected = Series([1.0,2.0,3.0])
assert_series_equal(results, expected)
results = s.convert_objects(True,False,True)
assert_series_equal(results, s)

s = Series([datetime(2001, 1, 1, 0, 0),datetime(2001, 1, 1, 0, 0)],
dtype='O')
results = s.convert_objects(True,True,True)
expected = Series([datetime(2001, 1, 1, 0, 0),datetime(2001, 1, 1, 0, 0)])
assert_series_equal(results, expected)
results = s.convert_objects(False,True,True)
assert_series_equal(results, s)

td = datetime(2001, 1, 1, 0, 0) - datetime(2000, 1, 1, 0, 0)
s = Series([td, td], dtype='O')
results = s.convert_objects(True,True,True)
expected = Series([td, td])
assert_series_equal(results, expected)
results = s.convert_objects(True,True,False)
assert_series_equal(results, s)


s = Series([1., 2, 3], index=['a', 'b', 'c'])
result = s.convert_objects(convert_dates=False, convert_numeric=True)
Expand Down Expand Up @@ -5885,23 +5949,28 @@ def test_convert_objects(self):
[Timestamp(
'20010101'), Timestamp('20010102'), Timestamp('20010103'),
lib.NaT, lib.NaT, lib.NaT, Timestamp('20010104'), Timestamp('20010105')], dtype='M8[ns]')
result = s2.convert_objects(
convert_dates='coerce', convert_numeric=False)
result = s2.convert_objects(convert_dates='coerce',
convert_numeric=False,
convert_timedeltas=False)
assert_series_equal(result, expected)
result = s2.convert_objects(
convert_dates='coerce', convert_numeric=True)
result = s2.convert_objects(convert_dates='coerce',
convert_numeric=False,
convert_timedeltas=False)
assert_series_equal(result, expected)

# coerce to all-NaT when nothing parses (if convert_dates='coerce')
s = Series(['foo', 'bar', 1, 1.0], dtype='O')
result = s.convert_objects(
convert_dates='coerce', convert_numeric=False)
assert_series_equal(result, s)
result = s.convert_objects(convert_dates='coerce',
convert_numeric=False,
convert_timedeltas=False)
expected = Series([lib.NaT]*4)
assert_series_equal(result, expected)

# preserve if non-object
s = Series([1], dtype='float32')
result = s.convert_objects(
convert_dates='coerce', convert_numeric=False)
result = s.convert_objects(convert_dates='coerce',
convert_numeric=False,
convert_timedeltas=False)
assert_series_equal(result, s)

#r = s.copy()
Expand All @@ -5910,13 +5979,14 @@ def test_convert_objects(self):
#self.assertEqual(result.dtype, 'M8[ns]')

# dateutil parses some single letters into today's value as a date
expected = Series([lib.NaT])
for x in 'abcdefghijklmnopqrstuvwxyz':
s = Series([x])
result = s.convert_objects(convert_dates='coerce')
assert_series_equal(result, s)
assert_series_equal(result, expected)
s = Series([x.upper()])
result = s.convert_objects(convert_dates='coerce')
assert_series_equal(result, s)
assert_series_equal(result, expected)

def test_convert_objects_preserve_bool(self):
s = Series([1, True, 3, 5], dtype=object)
Expand Down

0 comments on commit d9f7951

Please sign in to comment.