From 0727803b27e6b0299a903a47aecde0b747c91b7e Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Sun, 12 Jul 2015 23:15:13 -0400 Subject: [PATCH] BUG: Ensure 'coerce' actually coerces datatypes Changes behavior of convert objects so that passing 'coerce' will ensure that data of the correct type is returned, even if all values are null-types (NaN or NaT). closes #9589 --- doc/source/basics.rst | 25 ++++-- doc/source/whatsnew/v0.17.0.txt | 39 +++++++++ pandas/core/common.py | 106 ++++++++++++------------- pandas/core/frame.py | 11 ++- pandas/core/generic.py | 33 ++++---- pandas/core/groupby.py | 32 +++++--- pandas/core/internals.py | 23 +++--- pandas/io/tests/test_html.py | 8 +- pandas/io/tests/test_pytables.py | 10 +-- pandas/io/tests/test_stata.py | 2 +- pandas/io/wb.py | 2 +- pandas/tests/test_frame.py | 14 ++-- pandas/tests/test_groupby.py | 2 +- pandas/tests/test_indexing.py | 3 +- pandas/tests/test_internals.py | 4 +- pandas/tests/test_panel.py | 2 +- pandas/tests/test_series.py | 131 ++++++++++++++++++++++++------- pandas/tools/plotting.py | 5 +- pandas/tslib.pyx | 2 + 19 files changed, 301 insertions(+), 153 deletions(-) diff --git a/doc/source/basics.rst b/doc/source/basics.rst index 349e7e25fdafb..b5085ea1c55c4 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -1522,23 +1522,29 @@ then the more *general* one will be used as the result of the operation. object conversion ~~~~~~~~~~~~~~~~~ -:meth:`~DataFrame.convert_objects` is a method to try to force conversion of types from the ``object`` dtype to other types. -To force conversion of specific types that are *number like*, e.g. could be a string that represents a number, -pass ``convert_numeric=True``. This will force strings and numbers alike to be numbers if possible, otherwise -they will be set to ``np.nan``. +.. note:: + +   The syntax of :meth:`~DataFrame.convert_objects` changed in 0.17.0. 
+ +:meth:`~DataFrame.convert_objects` is a method to try to force conversion of +types from the ``object`` dtype to other types. To try converting specific +types that are *number like*, e.g. could be a string that represents a number, +pass ``numeric=True``. To force the conversion, add the keyword argument +``coerce=True``. This will force strings and numbers alike to be numbers if +possible, otherwise they will be set to ``np.nan``. .. ipython:: python df3['D'] = '1.' df3['E'] = '1' - df3.convert_objects(convert_numeric=True).dtypes + df3.convert_objects(numeric=True).dtypes # same, but specific dtype conversion df3['D'] = df3['D'].astype('float16') df3['E'] = df3['E'].astype('int32') df3.dtypes -To force conversion to ``datetime64[ns]``, pass ``convert_dates='coerce'``. +To force conversion to ``datetime64[ns]``, pass ``datetime=True`` and ``coerce=True``. This will convert any datetime-like object to dates, forcing other values to ``NaT``. This might be useful if you are reading in data which is mostly dates, but occasionally has non-dates intermixed and you want to represent as missing. @@ -1550,10 +1556,13 @@ but occasionally has non-dates intermixed and you want to represent as missing. 'foo', 1.0, 1, pd.Timestamp('20010104'), '20010105'], dtype='O') s - s.convert_objects(convert_dates='coerce') + s.convert_objects(datetime=True, coerce=True) -In addition, :meth:`~DataFrame.convert_objects` will attempt the *soft* conversion of any *object* dtypes, meaning that if all +Without passing ``coerce=True``, :meth:`~DataFrame.convert_objects` will attempt +the *soft* conversion of any *object* dtypes, meaning that if all the objects in a Series are of the same type, the Series will have that dtype. +Soft conversion alone will not *convert* strings - for example, a series of string +dates will not be converted to a series of datetimes unless ``coerce=True`` is passed. 
gotchas ~~~~~~~ diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index a3ec13439fe76..41569fd1eda65 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -48,6 +48,44 @@ Backwards incompatible API changes .. _whatsnew_0170.api_breaking.other: +Changes to convert_objects +^^^^^^^^^^^^^^^^^^^^^^^^^^ +- ``DataFrame.convert_objects`` keyword arguments have been shortened. (:issue:`10265`) + +====================== ============= +Old New +====================== ============= +``convert_dates`` ``datetime`` +``convert_numeric`` ``numeric`` +``convert_timedeltas`` ``timedelta`` +====================== ============= + +- Coercing types with ``DataFrame.convert_objects`` is now implemented using the +  keyword argument ``coerce=True``. Previously types were coerced by setting a +  keyword argument to ``'coerce'`` instead of ``True``, as in ``convert_dates='coerce'``. + + .. ipython:: python + + df = pd.DataFrame({'i': ['1','2'], 'f': ['apple', '4.2']}) + df + + The old usage of ``DataFrame.convert_objects`` used ``'coerce'`` along with the + type. + + .. code-block:: python + + In [2]: df.convert_objects(convert_numeric='coerce') + + Now the ``coerce`` keyword must be explicitly used. + + .. ipython:: python + + df.convert_objects(numeric=True, coerce=True) + +- The new default behavior for ``DataFrame.convert_objects`` is to do nothing, +  and so it is necessary to pass at least one conversion target when calling. + + Other API Changes ^^^^^^^^^^^^^^^^^ - Enable writing Excel files in :ref:`memory <_io.excel_writing_buffer>` using StringIO/BytesIO (:issue:`7074`) @@ -55,6 +93,7 @@ Other API Changes - Allow passing `kwargs` to the interpolation methods (:issue:`10378`). - Serialize metadata properties of subclasses of pandas objects (:issue:`10553`). + .. 
_whatsnew_0170.deprecations: Deprecations diff --git a/pandas/core/common.py b/pandas/core/common.py index 773ecea8f2712..2e20c25f2327d 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -1887,65 +1887,63 @@ def _maybe_box_datetimelike(value): _values_from_object = lib.values_from_object -def _possibly_convert_objects(values, convert_dates=True, - convert_numeric=True, - convert_timedeltas=True): + +def _possibly_convert_objects(values, + datetime=True, + numeric=True, + timedelta=True, + coerce=False): """ if we have an object dtype, try to coerce dates and/or numbers """ - # if we have passed in a list or scalar + conversion_count = sum((datetime, numeric, timedelta)) + if conversion_count == 0: + import warnings + warnings.warn('Must explicitly pass type for conversion. Original ' + 'value returned.', RuntimeWarning) + return values + if isinstance(values, (list, tuple)): + # List or scalar values = np.array(values, dtype=np.object_) - if not hasattr(values, 'dtype'): + elif not hasattr(values, 'dtype'): values = np.array([values], dtype=np.object_) - - # convert dates - if convert_dates and values.dtype == np.object_: - - # we take an aggressive stance and convert to datetime64[ns] - if convert_dates == 'coerce': - new_values = _possibly_cast_to_datetime( - values, 'M8[ns]', coerce=True) - - # if we are all nans then leave me alone - if not isnull(new_values).all(): - values = new_values - - else: - values = lib.maybe_convert_objects( - values, convert_datetime=convert_dates) - - # convert timedeltas - if convert_timedeltas and values.dtype == np.object_: - - if convert_timedeltas == 'coerce': - from pandas.tseries.timedeltas import to_timedelta - values = to_timedelta(values, coerce=True) - - # if we are all nans then leave me alone - if not isnull(new_values).all(): - values = new_values - - else: - values = lib.maybe_convert_objects( - values, convert_timedelta=convert_timedeltas) - - # convert to numeric - if values.dtype == np.object_: - if 
convert_numeric: - try: - new_values = lib.maybe_convert_numeric( - values, set(), coerce_numeric=True) - - # if we are all nans then leave me alone - if not isnull(new_values).all(): - values = new_values - - except: - pass - else: - - # soft-conversion - values = lib.maybe_convert_objects(values) + elif not is_object_dtype(values.dtype): + # If not object, do not attempt conversion + return values + + # If 1 flag is coerce, ensure 2 others are False + if coerce: + if conversion_count > 1: + raise ValueError("Only one of 'datetime', 'numeric' or " + "'timedelta' can be True when when coerce=True.") + + # Immediate return if coerce + if datetime: + return pd.to_datetime(values, coerce=True, box=False) + elif timedelta: + return pd.to_timedelta(values, coerce=True, box=False) + elif numeric: + return lib.maybe_convert_numeric(values, set(), coerce_numeric=True) + + # Soft conversions + if datetime: + values = lib.maybe_convert_objects(values, + convert_datetime=datetime) + + if timedelta and is_object_dtype(values.dtype): + # Object check to ensure only run if previous did not convert + values = lib.maybe_convert_objects(values, + convert_timedelta=timedelta) + + if numeric and is_object_dtype(values.dtype): + try: + converted = lib.maybe_convert_numeric(values, + set(), + coerce_numeric=True) + # If all NaNs, then do not-alter + values = converted if not isnull(converted).all() else values + except: + pass return values diff --git a/pandas/core/frame.py b/pandas/core/frame.py index bb192aeca5b6d..a7ecb74a67485 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3351,7 +3351,7 @@ def combine(self, other, func, fill_value=None, overwrite=True): return self._constructor(result, index=new_index, columns=new_columns).convert_objects( - convert_dates=True, + datetime=True, copy=False) def combine_first(self, other): @@ -3830,7 +3830,9 @@ def _apply_standard(self, func, axis, ignore_failures=False, reduce=True): if axis == 1: result = result.T - result = 
result.convert_objects(copy=False) + result = result.convert_objects(datetime=True, + timedelta=True, + copy=False) else: @@ -3958,7 +3960,10 @@ def append(self, other, ignore_index=False, verify_integrity=False): combined_columns = self.columns.tolist() + self.columns.union(other.index).difference(self.columns).tolist() other = other.reindex(combined_columns, copy=False) other = DataFrame(other.values.reshape((1, len(other))), - index=index, columns=combined_columns).convert_objects() + index=index, + columns=combined_columns) + other = other.convert_objects(datetime=True, timedelta=True) + if not self.columns.equals(combined_columns): self = self.reindex(columns=combined_columns) elif isinstance(other, list) and not isinstance(other[0], DataFrame): diff --git a/pandas/core/generic.py b/pandas/core/generic.py index f39e953284f26..1656b306a0ddb 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2433,22 +2433,26 @@ def copy(self, deep=True): data = self._data.copy(deep=deep) return self._constructor(data).__finalize__(self) - def convert_objects(self, convert_dates=True, convert_numeric=False, - convert_timedeltas=True, copy=True): + @deprecate_kwarg(old_arg_name='convert_dates', new_arg_name='datetime') + @deprecate_kwarg(old_arg_name='convert_numeric', new_arg_name='numeric') + @deprecate_kwarg(old_arg_name='convert_timedeltas', new_arg_name='timedelta') + def convert_objects(self, datetime=False, numeric=False, + timedelta=False, coerce=False, copy=True): """ Attempt to infer better dtype for object columns Parameters ---------- - convert_dates : boolean, default True - If True, convert to date where possible. If 'coerce', force - conversion, with unconvertible values becoming NaT. - convert_numeric : boolean, default False - If True, attempt to coerce to numbers (including strings), with + datetime : boolean, default False + If True, convert to date where possible. 
+ numeric : boolean, default False + If True, attempt to convert to numbers (including strings), with unconvertible values becoming NaN. - convert_timedeltas : boolean, default True - If True, convert to timedelta where possible. If 'coerce', force - conversion, with unconvertible values becoming NaT. + timedelta : boolean, default False + If True, convert to timedelta where possible. + coerce : boolean, default False + If True, force conversion with unconvertible values converted to + nulls (NaN or NaT) copy : boolean, default True If True, return a copy even if no copy is necessary (e.g. no conversion was done). Note: This is meant for internal use, and @@ -2459,9 +2463,10 @@ def convert_objects(self, convert_dates=True, convert_numeric=False, converted : same as input object """ return self._constructor( - self._data.convert(convert_dates=convert_dates, - convert_numeric=convert_numeric, - convert_timedeltas=convert_timedeltas, + self._data.convert(datetime=datetime, + numeric=numeric, + timedelta=timedelta, + coerce=coerce, copy=copy)).__finalize__(self) #---------------------------------------------------------------------- @@ -2859,7 +2864,7 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, '{0!r}').format(type(to_replace).__name__) raise TypeError(msg) # pragma: no cover - new_data = new_data.convert(copy=not inplace, convert_numeric=False) + new_data = new_data.convert(copy=not inplace, numeric=False) if inplace: self._update_inplace(new_data) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 4abdd1112c721..df788f806eda6 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -111,7 +111,7 @@ def f(self): except Exception: result = self.aggregate(lambda x: npfunc(x, axis=self.axis)) if _convert: - result = result.convert_objects() + result = result.convert_objects(datetime=True) return result f.__doc__ = "Compute %s of group values" % name @@ -2700,7 +2700,7 @@ def aggregate(self, arg, *args, 
**kwargs): self._insert_inaxis_grouper_inplace(result) result.index = np.arange(len(result)) - return result.convert_objects() + return result.convert_objects(datetime=True) def _aggregate_multiple_funcs(self, arg): from pandas.tools.merge import concat @@ -2939,18 +2939,25 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): # if we have date/time like in the original, then coerce dates # as we are stacking can easily have object dtypes here - if (self._selected_obj.ndim == 2 - and self._selected_obj.dtypes.isin(_DATELIKE_DTYPES).any()): - cd = 'coerce' + if (self._selected_obj.ndim == 2 and + self._selected_obj.dtypes.isin(_DATELIKE_DTYPES).any()): + result = result.convert_objects(numeric=True) + date_cols = self._selected_obj.select_dtypes( + include=list(_DATELIKE_DTYPES)).columns + result[date_cols] = (result[date_cols] + .convert_objects(datetime=True, + coerce=True)) else: - cd = True - result = result.convert_objects(convert_dates=cd) + result = result.convert_objects(datetime=True) + return self._reindex_output(result) else: # only coerce dates if we find at least 1 datetime - cd = 'coerce' if any([ isinstance(v,Timestamp) for v in values ]) else False - return Series(values, index=key_index).convert_objects(convert_dates=cd) + coerce = True if any([ isinstance(v,Timestamp) for v in values ]) else False + return (Series(values, index=key_index) + .convert_objects(datetime=True, + coerce=coerce)) else: # Handle cases like BinGrouper @@ -3053,7 +3060,8 @@ def transform(self, func, *args, **kwargs): if any(counts == 0): results = self._try_cast(results, obj[result.columns]) - return DataFrame(results,columns=result.columns,index=obj.index).convert_objects() + return (DataFrame(results,columns=result.columns,index=obj.index) + .convert_objects(datetime=True)) def _define_paths(self, func, *args, **kwargs): if isinstance(func, compat.string_types): @@ -3246,7 +3254,7 @@ def _wrap_aggregated_output(self, output, names=None): if self.axis == 
1: result = result.T - return self._reindex_output(result).convert_objects() + return self._reindex_output(result).convert_objects(datetime=True) def _wrap_agged_blocks(self, items, blocks): if not self.as_index: @@ -3264,7 +3272,7 @@ def _wrap_agged_blocks(self, items, blocks): if self.axis == 1: result = result.T - return self._reindex_output(result).convert_objects() + return self._reindex_output(result).convert_objects(datetime=True) def _reindex_output(self, result): """ diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 42d7163e7f741..360f0ca4685a0 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -622,7 +622,7 @@ def _is_empty_indexer(indexer): # may have to soft convert_objects here if block.is_object and not self.is_object: - block = block.convert(convert_numeric=False) + block = block.convert(numeric=False) return block except (ValueError, TypeError) as detail: @@ -1455,7 +1455,7 @@ def is_bool(self): """ return lib.is_bool_array(self.values.ravel()) - def convert(self, convert_dates=True, convert_numeric=True, convert_timedeltas=True, + def convert(self, datetime=True, numeric=True, timedelta=True, coerce=False, copy=True, by_item=True): """ attempt to coerce any object types to better types return a copy of the block (if copy = True) @@ -1472,9 +1472,11 @@ def convert(self, convert_dates=True, convert_numeric=True, convert_timedeltas=T values = self.iget(i) values = com._possibly_convert_objects( - values.ravel(), convert_dates=convert_dates, - convert_numeric=convert_numeric, - convert_timedeltas=convert_timedeltas, + values.ravel(), + datetime=datetime, + numeric=numeric, + timedelta=timedelta, + coerce=coerce ).reshape(values.shape) values = _block_shape(values, ndim=self.ndim) newb = make_block(values, @@ -1484,8 +1486,11 @@ def convert(self, convert_dates=True, convert_numeric=True, convert_timedeltas=T else: values = com._possibly_convert_objects( - self.values.ravel(), convert_dates=convert_dates, - 
convert_numeric=convert_numeric + self.values.ravel(), + datetime=datetime, + numeric=numeric, + timedelta=timedelta, + coerce=coerce ).reshape(self.values.shape) blocks.append(make_block(values, ndim=self.ndim, placement=self.mgr_locs)) @@ -1529,8 +1534,8 @@ def _maybe_downcast(self, blocks, downcast=None): # split and convert the blocks result_blocks = [] for blk in blocks: - result_blocks.extend(blk.convert(convert_dates=True, - convert_numeric=False)) + result_blocks.extend(blk.convert(datetime=True, + numeric=False)) return result_blocks def _can_hold_element(self, element): diff --git a/pandas/io/tests/test_html.py b/pandas/io/tests/test_html.py index fca9e1c4e47ca..9093df9f0bf62 100644 --- a/pandas/io/tests/test_html.py +++ b/pandas/io/tests/test_html.py @@ -540,9 +540,11 @@ def try_remove_ws(x): 'Hamilton Bank, NA', 'The Citizens Savings Bank'] dfnew = df.applymap(try_remove_ws).replace(old, new) gtnew = ground_truth.applymap(try_remove_ws) - converted = dfnew.convert_objects(convert_numeric=True) - tm.assert_frame_equal(converted.convert_objects(convert_dates='coerce'), - gtnew) + converted = dfnew.convert_objects(datetime=True, numeric=True) + date_cols = ['Closing Date','Updated Date'] + converted[date_cols] = converted[date_cols].convert_objects(datetime=True, + coerce=True) + tm.assert_frame_equal(converted,gtnew) @slow def test_gold_canyon(self): diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 1b932fb3759e5..ea30fb14251f4 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -403,7 +403,7 @@ def test_repr(self): df['datetime1'] = datetime.datetime(2001,1,2,0,0) df['datetime2'] = datetime.datetime(2001,1,3,0,0) df.ix[3:6,['obj1']] = np.nan - df = df.consolidate().convert_objects() + df = df.consolidate().convert_objects(datetime=True) warnings.filterwarnings('ignore', category=PerformanceWarning) store['df'] = df @@ -728,7 +728,7 @@ def test_put_mixed_type(self): df['datetime1'] = 
datetime.datetime(2001, 1, 2, 0, 0) df['datetime2'] = datetime.datetime(2001, 1, 3, 0, 0) df.ix[3:6, ['obj1']] = np.nan - df = df.consolidate().convert_objects() + df = df.consolidate().convert_objects(datetime=True) with ensure_clean_store(self.path) as store: _maybe_remove(store, 'df') @@ -1381,7 +1381,7 @@ def check_col(key,name,size): df_dc.ix[7:9, 'string'] = 'bar' df_dc['string2'] = 'cool' df_dc['datetime'] = Timestamp('20010102') - df_dc = df_dc.convert_objects() + df_dc = df_dc.convert_objects(datetime=True) df_dc.ix[3:5, ['A', 'B', 'datetime']] = np.nan _maybe_remove(store, 'df_dc') @@ -1843,7 +1843,7 @@ def test_table_mixed_dtypes(self): df['datetime1'] = datetime.datetime(2001, 1, 2, 0, 0) df['datetime2'] = datetime.datetime(2001, 1, 3, 0, 0) df.ix[3:6, ['obj1']] = np.nan - df = df.consolidate().convert_objects() + df = df.consolidate().convert_objects(datetime=True) with ensure_clean_store(self.path) as store: store.append('df1_mixed', df) @@ -1899,7 +1899,7 @@ def test_unimplemented_dtypes_table_columns(self): df['obj1'] = 'foo' df['obj2'] = 'bar' df['datetime1'] = datetime.date(2001, 1, 2) - df = df.consolidate().convert_objects() + df = df.consolidate().convert_objects(datetime=True) with ensure_clean_store(self.path) as store: # this fails because we have a date in the object block...... 
diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index 97bbfb0edf92c..8eb60b13fcc81 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -383,7 +383,7 @@ def test_read_write_reread_dta14(self): expected = self.read_csv(self.csv14) cols = ['byte_', 'int_', 'long_', 'float_', 'double_'] for col in cols: - expected[col] = expected[col].convert_objects(convert_numeric=True) + expected[col] = expected[col].convert_objects(datetime=True, numeric=True) expected['float_'] = expected['float_'].astype(np.float32) expected['date_td'] = pd.to_datetime(expected['date_td'], coerce=True) diff --git a/pandas/io/wb.py b/pandas/io/wb.py index 7a9443c4b9ac6..fba4c72a51376 100644 --- a/pandas/io/wb.py +++ b/pandas/io/wb.py @@ -155,7 +155,7 @@ def download(country=['MX', 'CA', 'US'], indicator=['NY.GDP.MKTP.CD', 'NY.GNS.IC out = reduce(lambda x, y: x.merge(y, how='outer'), data) out = out.drop('iso_code', axis=1) out = out.set_index(['country', 'year']) - out = out.convert_objects(convert_numeric=True) + out = out.convert_objects(datetime=True, numeric=True) return out else: msg = "No indicators returned data." 
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 4dea73a3a73a1..cc807aae2be49 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -6434,7 +6434,8 @@ def make_dtnat_arr(n,nnat=None): with ensure_clean('.csv') as pth: df=DataFrame(dict(a=s1,b=s2)) df.to_csv(pth,chunksize=chunksize) - recons = DataFrame.from_csv(pth).convert_objects('coerce') + recons = DataFrame.from_csv(pth).convert_objects(datetime=True, + coerce=True) assert_frame_equal(df, recons,check_names=False,check_less_precise=True) for ncols in [4]: @@ -7144,7 +7145,7 @@ def test_dtypes(self): def test_convert_objects(self): oops = self.mixed_frame.T.T - converted = oops.convert_objects() + converted = oops.convert_objects(datetime=True) assert_frame_equal(converted, self.mixed_frame) self.assertEqual(converted['A'].dtype, np.float64) @@ -7157,7 +7158,8 @@ def test_convert_objects(self): self.mixed_frame['J'] = '1.' self.mixed_frame['K'] = '1' self.mixed_frame.ix[0:5,['J','K']] = 'garbled' - converted = self.mixed_frame.convert_objects(convert_numeric=True) + converted = self.mixed_frame.convert_objects(datetime=True, + numeric=True) self.assertEqual(converted['H'].dtype, 'float64') self.assertEqual(converted['I'].dtype, 'int64') self.assertEqual(converted['J'].dtype, 'float64') @@ -7179,14 +7181,14 @@ def test_convert_objects(self): # mixed in a single column df = DataFrame(dict(s = Series([1, 'na', 3 ,4]))) - result = df.convert_objects(convert_numeric=True) + result = df.convert_objects(datetime=True, numeric=True) expected = DataFrame(dict(s = Series([1, np.nan, 3 ,4]))) assert_frame_equal(result, expected) def test_convert_objects_no_conversion(self): mixed1 = DataFrame( {'a': [1, 2, 3], 'b': [4.0, 5, 6], 'c': ['x', 'y', 'z']}) - mixed2 = mixed1.convert_objects() + mixed2 = mixed1.convert_objects(datetime=True) assert_frame_equal(mixed1, mixed2) def test_append_series_dict(self): @@ -10698,7 +10700,7 @@ def test_apply_convert_objects(self): 'F': 
np.random.randn(11)}) result = data.apply(lambda x: x, axis=1) - assert_frame_equal(result.convert_objects(), data) + assert_frame_equal(result.convert_objects(datetime=True), data) def test_apply_attach_name(self): result = self.frame.apply(lambda x: x.name) diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index e2a447207db82..91902aae3c835 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -601,7 +601,7 @@ def f(grp): return grp.iloc[0] result = df.groupby('A').apply(f)[['C']] e = df.groupby('A').first()[['C']] - e.loc['Pony'] = np.nan + e.loc['Pony'] = pd.NaT assert_frame_equal(result,e) # scalar outputs diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py index b666fba274b70..624fa11ac908a 100644 --- a/pandas/tests/test_indexing.py +++ b/pandas/tests/test_indexing.py @@ -3060,7 +3060,8 @@ def test_astype_assignment(self): assert_frame_equal(df,expected) df = df_orig.copy() - df.iloc[:,0:2] = df.iloc[:,0:2].convert_objects(convert_numeric=True) + df.iloc[:,0:2] = df.iloc[:,0:2].convert_objects(datetime=True, + numeric=True) expected = DataFrame([[1,2,'3','.4',5,6.,'foo']],columns=list('ABCDEFG')) assert_frame_equal(df,expected) diff --git a/pandas/tests/test_internals.py b/pandas/tests/test_internals.py index 36585abd1b98f..ef05b40827dfd 100644 --- a/pandas/tests/test_internals.py +++ b/pandas/tests/test_internals.py @@ -554,7 +554,7 @@ def _compare(old_mgr, new_mgr): mgr.set('a', np.array(['1'] * N, dtype=np.object_)) mgr.set('b', np.array(['2.'] * N, dtype=np.object_)) mgr.set('foo', np.array(['foo.'] * N, dtype=np.object_)) - new_mgr = mgr.convert(convert_numeric=True) + new_mgr = mgr.convert(numeric=True) self.assertEqual(new_mgr.get('a').dtype, np.int64) self.assertEqual(new_mgr.get('b').dtype, np.float64) self.assertEqual(new_mgr.get('foo').dtype, np.object_) @@ -566,7 +566,7 @@ def _compare(old_mgr, new_mgr): mgr.set('a', np.array(['1'] * N, dtype=np.object_)) mgr.set('b', 
np.array(['2.'] * N, dtype=np.object_)) mgr.set('foo', np.array(['foo.'] * N, dtype=np.object_)) - new_mgr = mgr.convert(convert_numeric=True) + new_mgr = mgr.convert(numeric=True) self.assertEqual(new_mgr.get('a').dtype, np.int64) self.assertEqual(new_mgr.get('b').dtype, np.float64) self.assertEqual(new_mgr.get('foo').dtype, np.object_) diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index bc0aaee1b10b6..9cdc769dd7d74 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -1119,7 +1119,7 @@ def test_convert_objects(self): # GH 4937 p = Panel(dict(A = dict(a = ['1','1.0']))) expected = Panel(dict(A = dict(a = [1,1.0]))) - result = p.convert_objects(convert_numeric='force') + result = p.convert_objects(numeric=True, coerce=True) assert_panel_equal(result, expected) def test_dtypes(self): diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 075362e006206..2bec54028fa35 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -8,6 +8,7 @@ from inspect import getargspec from itertools import product, starmap from distutils.version import LooseVersion +import warnings import nose @@ -5912,39 +5913,105 @@ def test_apply_dont_convert_dtype(self): result = s.apply(f, convert_dtype=False) self.assertEqual(result.dtype, object) + # GH 10265 def test_convert_objects(self): + # Tests: All to nans, coerce, true + # Test coercion returns correct type + s = Series(['a', 'b', 'c']) + results = s.convert_objects(datetime=True, coerce=True) + expected = Series([lib.NaT] * 3) + assert_series_equal(results, expected) + + results = s.convert_objects(numeric=True, coerce=True) + expected = Series([np.nan] * 3) + assert_series_equal(results, expected) + + expected = Series([lib.NaT] * 3, dtype=np.dtype('m8[ns]')) + results = s.convert_objects(timedelta=True, coerce=True) + assert_series_equal(results, expected) + + dt = datetime(2001, 1, 1, 0, 0) + td = dt - datetime(2000, 1, 1, 0, 0) + + # Test 
coercion with mixed types + s = Series(['a', '3.1415', dt, td]) + results = s.convert_objects(datetime=True, coerce=True) + expected = Series([lib.NaT, lib.NaT, dt, lib.NaT]) + assert_series_equal(results, expected) + + results = s.convert_objects(numeric=True, coerce=True) + expected = Series([nan, 3.1415, nan, nan]) + assert_series_equal(results, expected) + + results = s.convert_objects(timedelta=True, coerce=True) + expected = Series([lib.NaT, lib.NaT, lib.NaT, td], + dtype=np.dtype('m8[ns]')) + assert_series_equal(results, expected) + + # Test standard conversion returns original + results = s.convert_objects(datetime=True) + assert_series_equal(results, s) + results = s.convert_objects(numeric=True) + expected = Series([nan, 3.1415, nan, nan]) + assert_series_equal(results, expected) + results = s.convert_objects(timedelta=True) + assert_series_equal(results, s) + + # test pass-through and non-conversion when other types selected + s = Series(['1.0','2.0','3.0']) + results = s.convert_objects(True,True,True) + expected = Series([1.0,2.0,3.0]) + assert_series_equal(results, expected) + results = s.convert_objects(True,False,True) + assert_series_equal(results, s) + + s = Series([datetime(2001, 1, 1, 0, 0),datetime(2001, 1, 1, 0, 0)], + dtype='O') + results = s.convert_objects(True,True,True) + expected = Series([datetime(2001, 1, 1, 0, 0),datetime(2001, 1, 1, 0, 0)]) + assert_series_equal(results, expected) + results = s.convert_objects(False,True,True) + assert_series_equal(results, s) + + td = datetime(2001, 1, 1, 0, 0) - datetime(2000, 1, 1, 0, 0) + s = Series([td, td], dtype='O') + results = s.convert_objects(True,True,True) + expected = Series([td, td]) + assert_series_equal(results, expected) + results = s.convert_objects(True,True,False) + assert_series_equal(results, s) + s = Series([1., 2, 3], index=['a', 'b', 'c']) - result = s.convert_objects(convert_dates=False, convert_numeric=True) + result = s.convert_objects(numeric=True) 
assert_series_equal(result, s) # force numeric conversion r = s.copy().astype('O') r['a'] = '1' - result = r.convert_objects(convert_dates=False, convert_numeric=True) + result = r.convert_objects(numeric=True) assert_series_equal(result, s) r = s.copy().astype('O') r['a'] = '1.' - result = r.convert_objects(convert_dates=False, convert_numeric=True) + result = r.convert_objects(numeric=True) assert_series_equal(result, s) r = s.copy().astype('O') r['a'] = 'garbled' + result = r.convert_objects(numeric=True) expected = s.copy() - expected['a'] = np.nan - result = r.convert_objects(convert_dates=False, convert_numeric=True) + expected['a'] = nan assert_series_equal(result, expected) # GH 4119, not converting a mixed type (e.g.floats and object) s = Series([1, 'na', 3, 4]) - result = s.convert_objects(convert_numeric=True) - expected = Series([1, np.nan, 3, 4]) + result = s.convert_objects(datetime=True, numeric=True) + expected = Series([1, nan, 3, 4]) assert_series_equal(result, expected) s = Series([1, '', 3, 4]) - result = s.convert_objects(convert_numeric=True) - expected = Series([1, np.nan, 3, 4]) + result = s.convert_objects(datetime=True, numeric=True) assert_series_equal(result, expected) # dates @@ -5953,38 +6020,34 @@ def test_convert_objects(self): s2 = Series([datetime(2001, 1, 1, 0, 0), datetime(2001, 1, 2, 0, 0), datetime( 2001, 1, 3, 0, 0), 'foo', 1.0, 1, Timestamp('20010104'), '20010105'], dtype='O') - result = s.convert_objects(convert_dates=True, convert_numeric=False) + result = s.convert_objects(datetime=True) expected = Series( [Timestamp('20010101'), Timestamp('20010102'), Timestamp('20010103')], dtype='M8[ns]') assert_series_equal(result, expected) - result = s.convert_objects( - convert_dates='coerce', convert_numeric=False) - result = s.convert_objects( - convert_dates='coerce', convert_numeric=True) + result = s.convert_objects(datetime=True, coerce=True) assert_series_equal(result, expected) expected = Series( [Timestamp( '20010101'), 
Timestamp('20010102'), Timestamp('20010103'), lib.NaT, lib.NaT, lib.NaT, Timestamp('20010104'), Timestamp('20010105')], dtype='M8[ns]') - result = s2.convert_objects( - convert_dates='coerce', convert_numeric=False) + result = s2.convert_objects(datetime=True, + numeric=False, + timedelta=False, + coerce=True) assert_series_equal(result, expected) - result = s2.convert_objects( - convert_dates='coerce', convert_numeric=True) + result = s2.convert_objects(datetime=True, coerce=True) assert_series_equal(result, expected) - # preserver all-nans (if convert_dates='coerce') s = Series(['foo', 'bar', 1, 1.0], dtype='O') - result = s.convert_objects( - convert_dates='coerce', convert_numeric=False) - assert_series_equal(result, s) + result = s.convert_objects(datetime=True, coerce=True) + expected = Series([lib.NaT]*4) + assert_series_equal(result, expected) # preserver if non-object s = Series([1], dtype='float32') - result = s.convert_objects( - convert_dates='coerce', convert_numeric=False) + result = s.convert_objects(datetime=True, coerce=True) assert_series_equal(result, s) #r = s.copy() @@ -5993,23 +6056,31 @@ def test_convert_objects(self): #self.assertEqual(result.dtype, 'M8[ns]') # dateutil parses some single letters into today's value as a date + expected = Series([lib.NaT]) for x in 'abcdefghijklmnopqrstuvwxyz': s = Series([x]) - result = s.convert_objects(convert_dates='coerce') - assert_series_equal(result, s) + result = s.convert_objects(datetime=True, coerce=True) + assert_series_equal(result, expected) s = Series([x.upper()]) - result = s.convert_objects(convert_dates='coerce') - assert_series_equal(result, s) + result = s.convert_objects(datetime=True, coerce=True) + assert_series_equal(result, expected) + + def test_convert_objects_no_arg_warning(self): + s = Series(['1.0','2']) + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter('always', RuntimeWarning) + s.convert_objects() + self.assertEqual(len(w), 1) def 
test_convert_objects_preserve_bool(self): s = Series([1, True, 3, 5], dtype=object) - r = s.convert_objects(convert_numeric=True) + r = s.convert_objects(datetime=True, numeric=True) e = Series([1, 1, 3, 5], dtype='i8') tm.assert_series_equal(r, e) def test_convert_objects_preserve_all_bool(self): s = Series([False, True, False, False], dtype=object) - r = s.convert_objects(convert_numeric=True) + r = s.convert_objects(datetime=True, numeric=True) e = Series([False, True, False, False], dtype=bool) tm.assert_series_equal(r, e) diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index 07d7ced02e6ba..54298e8434a1b 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -1030,7 +1030,7 @@ def _compute_plot_data(self): label = 'None' data = data.to_frame(name=label) - numeric_data = data.convert_objects()._get_numeric_data() + numeric_data = data.convert_objects(datetime=True)._get_numeric_data() try: is_empty = numeric_data.empty @@ -1960,7 +1960,8 @@ def __init__(self, data, bins=10, bottom=0, **kwargs): def _args_adjust(self): if com.is_integer(self.bins): # create common bin edge - values = self.data.convert_objects()._get_numeric_data() + values = (self.data.convert_objects(datetime=True) + ._get_numeric_data()) values = np.ravel(values) values = values[~com.isnull(values)] diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index 27cd5e89220a9..9bb7d7261a8df 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -2416,6 +2416,8 @@ cdef inline parse_timedelta_string(object ts, coerce=False): elif have_dot: if (len(number) or len(frac)) and not len(unit) and current_unit is None: + if coerce: + return iNaT raise ValueError("no units specified") if len(frac) > 0 and len(frac) <= 3: