From 9e7666dae3b3b10d987ce154a51c78bcee6e0728 Mon Sep 17 00:00:00 2001 From: chris-b1 Date: Tue, 18 Jul 2017 06:26:44 -0500 Subject: [PATCH] API: add infer_objects for soft conversions (#16915) * API: add infer_objects for soft conversions * doc fixups * fixups * doc --- doc/source/api.rst | 2 + doc/source/basics.rst | 23 ++++++++- doc/source/whatsnew/v0.21.0.txt | 32 +++++++++++++ pandas/core/generic.py | 56 ++++++++++++++++++++-- pandas/tests/frame/test_block_internals.py | 26 ++++++++++ pandas/tests/series/test_dtypes.py | 18 +++++++ 6 files changed, 153 insertions(+), 4 deletions(-) diff --git a/doc/source/api.rst b/doc/source/api.rst index d6053791d6f4b..77d095a965221 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -270,6 +270,7 @@ Conversion :toctree: generated/ Series.astype + Series.infer_objects Series.copy Series.isnull Series.notnull @@ -777,6 +778,7 @@ Conversion DataFrame.astype DataFrame.convert_objects + DataFrame.infer_objects DataFrame.copy DataFrame.isnull DataFrame.notnull diff --git a/doc/source/basics.rst b/doc/source/basics.rst index d8b1602fb104d..4211b15203721 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -2024,7 +2024,28 @@ object conversion ~~~~~~~~~~~~~~~~~ pandas offers various functions to try to force conversion of types from the ``object`` dtype to other types. -The following functions are available for one dimensional object arrays or scalars: +In cases where the data is already of the correct type, but stored in an ``object`` array, the +:meth:`~DataFrame.infer_objects` and :meth:`~Series.infer_objects` methods can be used to soft convert +to the correct type. + + .. ipython:: python + + df = pd.DataFrame([[1, 2], + ['a', 'b'], + [datetime.datetime(2016, 3, 2), datetime.datetime(2016, 3, 2)]]) + df = df.T + df + df.dtypes + +Because the data was transposed, the original inference stored all columns as object, which +``infer_objects`` will correct. + + .. 
ipython:: python + + df.infer_objects().dtypes + +The following functions are available for one dimensional object arrays or scalars to perform +hard conversion of objects to a specified type: - :meth:`~pandas.to_numeric` (conversion to numeric dtypes) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index c63d4575bac43..cba3691b25ab1 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -25,6 +25,38 @@ New features - Added ``__fspath__`` method to :class:`~pandas.HDFStore`, :class:`~pandas.ExcelFile`, and :class:`~pandas.ExcelWriter` to work properly with the file system path protocol (:issue:`13823`) + +.. _whatsnew_0210.enhancements.infer_objects: + +``infer_objects`` type conversion +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The :meth:`~DataFrame.infer_objects` and :meth:`~Series.infer_objects` +methods have been added to perform dtype inference on object columns, replacing +some of the functionality of the deprecated ``convert_objects`` +method. See the documentation :ref:`here <basics.object_conversion>` +for more details. (:issue:`11221`) + +This function only performs soft conversions on object columns, converting Python objects +to native types, but not any coercive conversions. For example: + +.. ipython:: python + + df = pd.DataFrame({'A': [1, 2, 3], + 'B': np.array([1, 2, 3], dtype='object'), + 'C': ['1', '2', '3']}) + df.dtypes + df.infer_objects().dtypes + +Note that column ``'C'`` was not converted - only scalar numeric types +will be inferred to a new type. Other types of conversion should be accomplished +using the :func:`to_numeric` function (or :func:`to_datetime`, :func:`to_timedelta`). +.. ipython:: python + + df = df.infer_objects() + df['C'] = pd.to_numeric(df['C'], errors='coerce') + df.dtypes + .. 
_whatsnew_0210.enhancements.other: Other Enhancements diff --git a/pandas/core/generic.py b/pandas/core/generic.py index f12592feaa4c3..c95129bdaa005 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3671,9 +3671,12 @@ def convert_objects(self, convert_dates=True, convert_numeric=False, converted : same as input object """ from warnings import warn - warn("convert_objects is deprecated. Use the data-type specific " - "converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.", - FutureWarning, stacklevel=2) + msg = ("convert_objects is deprecated. To re-infer data dtypes for " + "object columns, use {klass}.infer_objects()\nFor all " + "other conversions use the data-type specific converters " + "pd.to_datetime, pd.to_timedelta and pd.to_numeric." + ).format(klass=self.__class__.__name__) + warn(msg, FutureWarning, stacklevel=2) return self._constructor( self._data.convert(convert_dates=convert_dates, @@ -3681,6 +3684,53 @@ def convert_objects(self, convert_dates=True, convert_numeric=False, convert_timedeltas=convert_timedeltas, copy=copy)).__finalize__(self) + def infer_objects(self): + """ + Attempt to infer better dtypes for object columns. + + Attempts soft conversion of object-dtyped + columns, leaving non-object and unconvertible + columns unchanged. The inference rules are the + same as during normal Series/DataFrame construction. + + .. versionadded:: 0.21.0 + + See Also + -------- + pandas.to_datetime : Convert argument to datetime. + pandas.to_timedelta : Convert argument to timedelta. 
+ pandas.to_numeric : Convert argument to numeric type. + + Returns + ------- + converted : same type as input object + + Examples + -------- + >>> df = pd.DataFrame({"A": ["a", 1, 2, 3]}) + >>> df = df.iloc[1:] + >>> df + A + 1 1 + 2 2 + 3 3 + + >>> df.dtypes + A object + dtype: object + + >>> df.infer_objects().dtypes + A int64 + dtype: object + """ + # numeric=False is necessary to only soft convert; + # python objects will still be converted to + # native numpy numeric types + return self._constructor( + self._data.convert(datetime=True, numeric=False, + timedelta=True, coerce=False, + copy=True)).__finalize__(self) + # ---------------------------------------------------------------------- # Filling NA's diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index c1a5b437be5d0..f66070fd66813 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -495,6 +495,32 @@ def test_convert_objects_no_conversion(self): mixed2 = mixed1._convert(datetime=True) assert_frame_equal(mixed1, mixed2) + def test_infer_objects(self): + # GH 11221 + df = DataFrame({'a': ['a', 1, 2, 3], + 'b': ['b', 2.0, 3.0, 4.1], + 'c': ['c', datetime(2016, 1, 1), + datetime(2016, 1, 2), + datetime(2016, 1, 3)], + 'd': [1, 2, 3, 'd']}, + columns=['a', 'b', 'c', 'd']) + df = df.iloc[1:].infer_objects() + + assert df['a'].dtype == 'int64' + assert df['b'].dtype == 'float64' + assert df['c'].dtype == 'M8[ns]' + assert df['d'].dtype == 'object' + + expected = DataFrame({'a': [1, 2, 3], + 'b': [2.0, 3.0, 4.1], + 'c': [datetime(2016, 1, 1), + datetime(2016, 1, 2), + datetime(2016, 1, 3)], + 'd': [2, 3, 'd']}, + columns=['a', 'b', 'c', 'd']) + # reconstruct frame to verify inference is the same + tm.assert_frame_equal(df.reset_index(drop=True), expected) + def test_stale_cached_series_bug_473(self): # this is chained, but ok diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index 
2ec579842e33f..c214280ee8386 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -268,3 +268,21 @@ def test_series_to_categorical(self): expected = Series(['a', 'b', 'c'], dtype='category') tm.assert_series_equal(result, expected) + + def test_infer_objects_series(self): + # GH 11221 + actual = Series(np.array([1, 2, 3], dtype='O')).infer_objects() + expected = Series([1, 2, 3]) + tm.assert_series_equal(actual, expected) + + actual = Series(np.array([1, 2, 3, None], dtype='O')).infer_objects() + expected = Series([1., 2., 3., np.nan]) + tm.assert_series_equal(actual, expected) + + # only soft conversions, unconvertible values pass through unchanged + actual = (Series(np.array([1, 2, 3, None, 'a'], dtype='O')) + .infer_objects()) + expected = Series([1, 2, 3, None, 'a']) + + assert actual.dtype == 'object' + tm.assert_series_equal(actual, expected)