Skip to content

Commit

Permalink
BUG: Ensure 'coerce' actually coerces datatypes
Browse files Browse the repository at this point in the history
Changes the behavior of convert_objects so that passing 'coerce' will
ensure that data of the correct type is returned, even if all
values are null-types (NaN or NaT).

closes #9589
  • Loading branch information
Kevin Sheppard committed Jun 3, 2015
1 parent efc4a08 commit 44fa303
Show file tree
Hide file tree
Showing 3 changed files with 84 additions and 60 deletions.
82 changes: 41 additions & 41 deletions pandas/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ class SettingWithCopyError(ValueError):
# Warning category with no behavior beyond the built-in Warning base.
# NOTE(review): presumably issued when a value is set on a copy of data;
# the sites that emit it are not visible here -- confirm.
class SettingWithCopyWarning(Warning):
pass

# Warning category emitted by _possibly_convert_objects when one conversion
# flag is 'coerce' but the other two are not both False, i.e. the remaining
# soft conversion flags are ignored.
class RedundantSettingWarning(Warning):
pass

# Exception deriving from both PandasError and KeyError, so a handler for
# either base class will catch it.
# NOTE(review): the sites that raise it are not visible here.
class AmbiguousIndexError(PandasError, KeyError):
pass
Expand Down Expand Up @@ -1888,60 +1890,58 @@ def _possibly_convert_objects(values, convert_dates=True,
convert_timedeltas=True):
""" if we have an object dtype, try to coerce dates and/or numbers """

# If 1 flag is coerce, ensure 2 others are False
conversions = (convert_dates, convert_numeric, convert_timedeltas)
if 'coerce' in conversions:
coerce_count = sum([c == 'coerce' for c in conversions])
if coerce_count > 1:
raise ValueError("'coerce' can be used at most once.")

false_count = sum([not c for c in conversions])
if false_count != 2:
import warnings
warnings.warn("Soft conversion flags ignored when using 'coerce'",
RedundantSettingWarning)

# if we have passed in a list or scalar
if isinstance(values, (list, tuple)):
values = np.array(values, dtype=np.object_)
if not hasattr(values, 'dtype'):
values = np.array([values], dtype=np.object_)

# convert dates
if convert_dates and values.dtype == np.object_:

# we take an aggressive stance and convert to datetime64[ns]
if convert_dates == 'coerce':
new_values = _possibly_cast_to_datetime(
values, 'M8[ns]', coerce=True)
# If not object, do not convert
if values.dtype != np.object_:
return values

# if we are all nans then leave me alone
if not isnull(new_values).all():
values = new_values

else:
values = lib.maybe_convert_objects(
values, convert_datetime=convert_dates)
# Immediate return if coerce
if convert_dates == 'coerce':
return _possibly_cast_to_datetime(values, 'M8[ns]', coerce=True)
if convert_timedeltas == 'coerce':
from pandas.tseries.timedeltas import to_timedelta
return np.asanyarray(to_timedelta(values, coerce=True))
if convert_numeric == 'coerce':
return lib.maybe_convert_numeric(values, set(), coerce_numeric=True)

# convert dates
if convert_dates:
values = lib.maybe_convert_objects(values,
convert_datetime=convert_dates)
# convert timedeltas
if convert_timedeltas and values.dtype == np.object_:

if convert_timedeltas == 'coerce':
from pandas.tseries.timedeltas import to_timedelta
values = to_timedelta(values, coerce=True)

# Only if previous failed
values = lib.maybe_convert_objects(values,
convert_timedelta=convert_timedeltas)
# convert to numeric
if convert_numeric and values.dtype == np.object_:
# Only if previous failed
try:
new_values = lib.maybe_convert_numeric(values, set(),
coerce_numeric=True)
# if we are all nans then leave me alone
if not isnull(new_values).all():
values = new_values

else:
values = lib.maybe_convert_objects(
values, convert_timedelta=convert_timedeltas)

# convert to numeric
if values.dtype == np.object_:
if convert_numeric:
try:
new_values = lib.maybe_convert_numeric(
values, set(), coerce_numeric=True)

# if we are all nans then leave me alone
if not isnull(new_values).all():
values = new_values

except:
pass
else:

# soft-conversion
values = lib.maybe_convert_objects(values)
except:
pass

return values

Expand Down
6 changes: 4 additions & 2 deletions pandas/core/internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -1484,8 +1484,10 @@ def convert(self, convert_dates=True, convert_numeric=True, convert_timedeltas=T
else:

values = com._possibly_convert_objects(
self.values.ravel(), convert_dates=convert_dates,
convert_numeric=convert_numeric
self.values.ravel(),
convert_dates=convert_dates,
convert_numeric=convert_numeric,
convert_timedeltas=convert_timedeltas
).reshape(self.values.shape)
blocks.append(make_block(values,
ndim=self.ndim, placement=self.mgr_locs))
Expand Down
56 changes: 39 additions & 17 deletions pandas/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,15 @@
from inspect import getargspec
from itertools import product, starmap
from distutils.version import LooseVersion
import warnings

import nose

from numpy import nan, inf
import numpy as np
import numpy.ma as ma
import pandas as pd
import pandas.lib as lib

import pandas as pd
from pandas import (Index, Series, DataFrame, isnull, notnull, bdate_range,
date_range, period_range, timedelta_range)
from pandas.core.index import MultiIndex
Expand All @@ -25,11 +26,8 @@
from pandas.tseries.tdi import Timedelta, TimedeltaIndex
import pandas.core.common as com
import pandas.core.config as cf
import pandas.lib as lib

import pandas.core.datetools as datetools
import pandas.core.nanops as nanops

from pandas.compat import StringIO, lrange, range, zip, u, OrderedDict, long
from pandas import compat
from pandas.util.testing import (assert_series_equal,
Expand All @@ -39,6 +37,7 @@
import pandas.util.testing as tm



#------------------------------------------------------------------------------
# Series test cases

Expand Down Expand Up @@ -3432,7 +3431,6 @@ def test_ops_datetimelike_align(self):

def test_timedelta64_functions(self):

from datetime import timedelta
from pandas import date_range

# index min/max
Expand Down Expand Up @@ -5820,6 +5818,24 @@ def test_apply_dont_convert_dtype(self):
self.assertEqual(result.dtype, object)

def test_convert_objects(self):
# Tests: All to nans, coerce, true
# Test coercion returns correct type
s = Series(['a', 'b', 'c'])
results = s.convert_objects('coerce', False, False)
expected = Series([lib.NaT] * 3)
assert_series_equal(results, expected)

results = s.convert_objects(False, 'coerce', False)
expected = Series([np.nan] * 3)
assert_series_equal(results, expected)

results = s.convert_objects(False, False, 'coerce')
expected = Series([lib.NaT] * 3, dtype=np.dtype('m8[ns]'))
assert_series_equal(results, expected)

with warnings.catch_warnings(record=True) as w:
tm.assert_produces_warning(s.convert_objects(True, True, 'coerce'),
com.RedundantSettingWarning)

s = Series([1., 2, 3], index=['a', 'b', 'c'])
result = s.convert_objects(convert_dates=False, convert_numeric=True)
Expand Down Expand Up @@ -5875,23 +5891,28 @@ def test_convert_objects(self):
[Timestamp(
'20010101'), Timestamp('20010102'), Timestamp('20010103'),
lib.NaT, lib.NaT, lib.NaT, Timestamp('20010104'), Timestamp('20010105')], dtype='M8[ns]')
result = s2.convert_objects(
convert_dates='coerce', convert_numeric=False)
result = s2.convert_objects(convert_dates='coerce',
convert_numeric=False,
convert_timedeltas=False)
assert_series_equal(result, expected)
result = s2.convert_objects(
convert_dates='coerce', convert_numeric=True)
result = s2.convert_objects(convert_dates='coerce',
convert_numeric=False,
convert_timedeltas=False)
assert_series_equal(result, expected)

# preserve all-nans (if convert_dates='coerce')
s = Series(['foo', 'bar', 1, 1.0], dtype='O')
result = s.convert_objects(
convert_dates='coerce', convert_numeric=False)
assert_series_equal(result, s)
result = s.convert_objects(convert_dates='coerce',
convert_numeric=False,
convert_timedeltas=False)
expected = Series([lib.NaT]*4)
assert_series_equal(result, expected)

# preserve if non-object
s = Series([1], dtype='float32')
result = s.convert_objects(
convert_dates='coerce', convert_numeric=False)
result = s.convert_objects(convert_dates='coerce',
convert_numeric=False,
convert_timedeltas=False)
assert_series_equal(result, s)

#r = s.copy()
Expand All @@ -5900,13 +5921,14 @@ def test_convert_objects(self):
#self.assertEqual(result.dtype, 'M8[ns]')

# dateutil parses some single letters into today's value as a date
expected = Series([lib.NaT])
for x in 'abcdefghijklmnopqrstuvwxyz':
s = Series([x])
result = s.convert_objects(convert_dates='coerce')
assert_series_equal(result, s)
assert_series_equal(result, expected)
s = Series([x.upper()])
result = s.convert_objects(convert_dates='coerce')
assert_series_equal(result, s)
assert_series_equal(result, expected)

def test_convert_objects_preserve_bool(self):
s = Series([1, True, 3, 5], dtype=object)
Expand Down

0 comments on commit 44fa303

Please sign in to comment.