DEPR: deprecate default of skipna=False in infer_dtype (#24050)

pandas-dev · Jan 4, 2019 · 2f842f2 · 2f842f2
1 parent c5166b6
commit 2f842f2
Show file tree

Hide file tree

Showing 6 changed files with 73 additions and 58 deletions.
diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst
@@ -1220,6 +1220,7 @@ Deprecations
 - :func:`pandas.api.types.is_datetimetz` is deprecated in favor of `pandas.api.types.is_datetime64tz` (:issue:`23917`)
 - Creating a :class:`TimedeltaIndex`, :class:`DatetimeIndex`, or :class:`PeriodIndex` by passing range arguments `start`, `end`, and `periods` is deprecated in favor of :func:`timedelta_range`, :func:`date_range`, or :func:`period_range` (:issue:`23919`)
 - Passing a string alias like ``'datetime64[ns, UTC]'`` as the ``unit`` parameter to :class:`DatetimeTZDtype` is deprecated. Use :class:`DatetimeTZDtype.construct_from_string` instead (:issue:`23990`).
+- The ``skipna`` parameter of :meth:`~pandas.api.types.infer_dtype` will switch to ``True`` by default in a future version of pandas (:issue:`17066`, :issue:`24050`)
 - In :meth:`Series.where` with Categorical data, providing an ``other`` that is not present in the categories is deprecated. Convert the categorical to a different dtype or add the ``other`` to the categories first (:issue:`24077`).
 - :meth:`Series.clip_lower`, :meth:`Series.clip_upper`, :meth:`DataFrame.clip_lower` and :meth:`DataFrame.clip_upper` are deprecated and will be removed in a future version. Use ``Series.clip(lower=threshold)``, ``Series.clip(upper=threshold)`` and the equivalent ``DataFrame`` methods (:issue:`24203`)
 

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
@@ -4,6 +4,7 @@ from fractions import Fraction
 from numbers import Number
 
 import sys
+import warnings
 
 import cython
 from cython import Py_ssize_t
@@ -1079,7 +1080,7 @@ cdef _try_infer_map(v):
     return None
 
 
-def infer_dtype(value: object, skipna: bool=False) -> str:
+def infer_dtype(value: object, skipna: object=None) -> str:
     """
     Efficiently infer the type of a passed val, or list-like
     array of values. Return a string describing the type.
@@ -1088,8 +1089,7 @@ def infer_dtype(value: object, skipna: bool=False) -> str:
     ----------
     value : scalar, list, ndarray, or pandas type
     skipna : bool, default False
-        Ignore NaN values when inferring the type. The default of ``False``
-        will be deprecated in a later version of pandas.
+        Ignore NaN values when inferring the type.
 
         .. versionadded:: 0.21.0
 
@@ -1186,6 +1186,12 @@ def infer_dtype(value: object, skipna: bool=False) -> str:
         bint seen_pdnat = False
         bint seen_val = False
 
+    if skipna is None:
+        msg = ('A future version of pandas will default to `skipna=True`. To '
+               'silence this warning, pass `skipna=True|False` explicitly.')
+        warnings.warn(msg, FutureWarning, stacklevel=2)
+        skipna = False
+
     if util.is_array(value):
         values = value
     elif hasattr(value, 'dtype'):

diff --git a/pandas/core/arrays/array_.py b/pandas/core/arrays/array_.py
@@ -209,7 +209,7 @@ def array(data,         # type: Sequence[object]
         return cls._from_sequence(data, dtype=dtype, copy=copy)
 
     if dtype is None:
-        inferred_dtype = lib.infer_dtype(data)
+        inferred_dtype = lib.infer_dtype(data, skipna=False)
         if inferred_dtype == 'period':
             try:
                 return period_array(data, copy=copy)

diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
@@ -962,8 +962,8 @@ def _maybe_coerce_merge_keys(self):
             # object values are allowed to be merged
             elif ((lk_is_object and is_numeric_dtype(rk)) or
                   (is_numeric_dtype(lk) and rk_is_object)):
-                inferred_left = lib.infer_dtype(lk)
-                inferred_right = lib.infer_dtype(rk)
+                inferred_left = lib.infer_dtype(lk, skipna=False)
+                inferred_right = lib.infer_dtype(rk, skipna=False)
                 bool_types = ['integer', 'mixed-integer', 'boolean', 'empty']
                 string_types = ['string', 'unicode', 'mixed', 'bytes', 'empty']
 

diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py
@@ -334,11 +334,11 @@ def test_infer_dtype_bytes(self):
 
         # string array of bytes
         arr = np.array(list('abc'), dtype='S1')
-        assert lib.infer_dtype(arr, skipna=False) == compare
+        assert lib.infer_dtype(arr, skipna=True) == compare
 
         # object array of bytes
         arr = arr.astype(object)
-        assert lib.infer_dtype(arr, skipna=False) == compare
+        assert lib.infer_dtype(arr, skipna=True) == compare
 
         # object array of bytes with missing values
         assert lib.infer_dtype([b'a', np.nan, b'c'], skipna=True) == compare
@@ -538,32 +538,40 @@ def test_length_zero(self, skipna):
 
     def test_integers(self):
         arr = np.array([1, 2, 3, np.int64(4), np.int32(5)], dtype='O')
-        result = lib.infer_dtype(arr, skipna=False)
+        result = lib.infer_dtype(arr, skipna=True)
         assert result == 'integer'
 
         arr = np.array([1, 2, 3, np.int64(4), np.int32(5), 'foo'], dtype='O')
-        result = lib.infer_dtype(arr, skipna=False)
+        result = lib.infer_dtype(arr, skipna=True)
         assert result == 'mixed-integer'
 
         arr = np.array([1, 2, 3, 4, 5], dtype='i4')
-        result = lib.infer_dtype(arr, skipna=False)
+        result = lib.infer_dtype(arr, skipna=True)
         assert result == 'integer'
 
+    def test_deprecation(self):
+        # GH 24050
+        arr = np.array([1, 2, 3], dtype=object)
+
+        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+            result = lib.infer_dtype(arr)  # default: skipna=None -> warn
+            assert result == 'integer'
+
     def test_bools(self):
         arr = np.array([True, False, True, True, True], dtype='O')
-        result = lib.infer_dtype(arr, skipna=False)
+        result = lib.infer_dtype(arr, skipna=True)
         assert result == 'boolean'
 
         arr = np.array([np.bool_(True), np.bool_(False)], dtype='O')
-        result = lib.infer_dtype(arr, skipna=False)
+        result = lib.infer_dtype(arr, skipna=True)
         assert result == 'boolean'
 
         arr = np.array([True, False, True, 'foo'], dtype='O')
-        result = lib.infer_dtype(arr, skipna=False)
+        result = lib.infer_dtype(arr, skipna=True)
         assert result == 'mixed'
 
         arr = np.array([True, False, True], dtype=bool)
-        result = lib.infer_dtype(arr, skipna=False)
+        result = lib.infer_dtype(arr, skipna=True)
         assert result == 'boolean'
 
         arr = np.array([True, np.nan, False], dtype='O')
@@ -575,38 +583,38 @@ def test_bools(self):
 
     def test_floats(self):
         arr = np.array([1., 2., 3., np.float64(4), np.float32(5)], dtype='O')
-        result = lib.infer_dtype(arr, skipna=False)
+        result = lib.infer_dtype(arr, skipna=True)
         assert result == 'floating'
 
         arr = np.array([1, 2, 3, np.float64(4), np.float32(5), 'foo'],
                        dtype='O')
-        result = lib.infer_dtype(arr, skipna=False)
+        result = lib.infer_dtype(arr, skipna=True)
         assert result == 'mixed-integer'
 
         arr = np.array([1, 2, 3, 4, 5], dtype='f4')
-        result = lib.infer_dtype(arr, skipna=False)
+        result = lib.infer_dtype(arr, skipna=True)
         assert result == 'floating'
 
         arr = np.array([1, 2, 3, 4, 5], dtype='f8')
-        result = lib.infer_dtype(arr, skipna=False)
+        result = lib.infer_dtype(arr, skipna=True)
         assert result == 'floating'
 
     def test_decimals(self):
         # GH15690
         arr = np.array([Decimal(1), Decimal(2), Decimal(3)])
-        result = lib.infer_dtype(arr, skipna=False)
+        result = lib.infer_dtype(arr, skipna=True)
         assert result == 'decimal'
 
         arr = np.array([1.0, 2.0, Decimal(3)])
-        result = lib.infer_dtype(arr, skipna=False)
+        result = lib.infer_dtype(arr, skipna=True)
         assert result == 'mixed'
 
         arr = np.array([Decimal(1), Decimal('NaN'), Decimal(3)])
-        result = lib.infer_dtype(arr, skipna=False)
+        result = lib.infer_dtype(arr, skipna=True)
         assert result == 'decimal'
 
         arr = np.array([Decimal(1), np.nan, Decimal(3)], dtype='O')
-        result = lib.infer_dtype(arr, skipna=False)
+        result = lib.infer_dtype(arr, skipna=True)
         assert result == 'decimal'
 
     def test_string(self):
@@ -648,34 +656,34 @@ def test_infer_dtype_datetime(self):
 
         arr = np.array([Timestamp('2011-01-01'),
                         Timestamp('2011-01-02')])
-        assert lib.infer_dtype(arr, skipna=False) == 'datetime'
+        assert lib.infer_dtype(arr, skipna=True) == 'datetime'
 
         arr = np.array([np.datetime64('2011-01-01'),
                         np.datetime64('2011-01-01')], dtype=object)
-        assert lib.infer_dtype(arr, skipna=False) == 'datetime64'
+        assert lib.infer_dtype(arr, skipna=True) == 'datetime64'
 
         arr = np.array([datetime(2011, 1, 1), datetime(2012, 2, 1)])
-        assert lib.infer_dtype(arr, skipna=False) == 'datetime'
+        assert lib.infer_dtype(arr, skipna=True) == 'datetime'
 
         # starts with nan
         for n in [pd.NaT, np.nan]:
             arr = np.array([n, pd.Timestamp('2011-01-02')])
-            assert lib.infer_dtype(arr, skipna=False) == 'datetime'
+            assert lib.infer_dtype(arr, skipna=True) == 'datetime'
 
             arr = np.array([n, np.datetime64('2011-01-02')])
-            assert lib.infer_dtype(arr, skipna=False) == 'datetime64'
+            assert lib.infer_dtype(arr, skipna=True) == 'datetime64'
 
             arr = np.array([n, datetime(2011, 1, 1)])
-            assert lib.infer_dtype(arr, skipna=False) == 'datetime'
+            assert lib.infer_dtype(arr, skipna=True) == 'datetime'
 
             arr = np.array([n, pd.Timestamp('2011-01-02'), n])
-            assert lib.infer_dtype(arr, skipna=False) == 'datetime'
+            assert lib.infer_dtype(arr, skipna=True) == 'datetime'
 
             arr = np.array([n, np.datetime64('2011-01-02'), n])
-            assert lib.infer_dtype(arr, skipna=False) == 'datetime64'
+            assert lib.infer_dtype(arr, skipna=True) == 'datetime64'
 
             arr = np.array([n, datetime(2011, 1, 1), n])
-            assert lib.infer_dtype(arr, skipna=False) == 'datetime'
+            assert lib.infer_dtype(arr, skipna=True) == 'datetime'
 
         # different type of nat
         arr = np.array([np.timedelta64('nat'),
@@ -689,58 +697,58 @@ def test_infer_dtype_datetime(self):
         # mixed datetime
         arr = np.array([datetime(2011, 1, 1),
                         pd.Timestamp('2011-01-02')])
-        assert lib.infer_dtype(arr, skipna=False) == 'datetime'
+        assert lib.infer_dtype(arr, skipna=True) == 'datetime'
 
         # should be datetime?
         arr = np.array([np.datetime64('2011-01-01'),
                         pd.Timestamp('2011-01-02')])
-        assert lib.infer_dtype(arr, skipna=False) == 'mixed'
+        assert lib.infer_dtype(arr, skipna=True) == 'mixed'
 
         arr = np.array([pd.Timestamp('2011-01-02'),
                         np.datetime64('2011-01-01')])
-        assert lib.infer_dtype(arr, skipna=False) == 'mixed'
+        assert lib.infer_dtype(arr, skipna=True) == 'mixed'
 
         arr = np.array([np.nan, pd.Timestamp('2011-01-02'), 1])
-        assert lib.infer_dtype(arr, skipna=False) == 'mixed-integer'
+        assert lib.infer_dtype(arr, skipna=True) == 'mixed-integer'
 
         arr = np.array([np.nan, pd.Timestamp('2011-01-02'), 1.1])
-        assert lib.infer_dtype(arr, skipna=False) == 'mixed'
+        assert lib.infer_dtype(arr, skipna=True) == 'mixed'
 
         arr = np.array([np.nan, '2011-01-01', pd.Timestamp('2011-01-02')])
-        assert lib.infer_dtype(arr, skipna=False) == 'mixed'
+        assert lib.infer_dtype(arr, skipna=True) == 'mixed'
 
     def test_infer_dtype_timedelta(self):
 
         arr = np.array([pd.Timedelta('1 days'),
                         pd.Timedelta('2 days')])
-        assert lib.infer_dtype(arr, skipna=False) == 'timedelta'
+        assert lib.infer_dtype(arr, skipna=True) == 'timedelta'
 
         arr = np.array([np.timedelta64(1, 'D'),
                         np.timedelta64(2, 'D')], dtype=object)
-        assert lib.infer_dtype(arr, skipna=False) == 'timedelta'
+        assert lib.infer_dtype(arr, skipna=True) == 'timedelta'
 
         arr = np.array([timedelta(1), timedelta(2)])
-        assert lib.infer_dtype(arr, skipna=False) == 'timedelta'
+        assert lib.infer_dtype(arr, skipna=True) == 'timedelta'
 
         # starts with nan
         for n in [pd.NaT, np.nan]:
             arr = np.array([n, Timedelta('1 days')])
-            assert lib.infer_dtype(arr, skipna=False) == 'timedelta'
+            assert lib.infer_dtype(arr, skipna=True) == 'timedelta'
 
             arr = np.array([n, np.timedelta64(1, 'D')])
-            assert lib.infer_dtype(arr, skipna=False) == 'timedelta'
+            assert lib.infer_dtype(arr, skipna=True) == 'timedelta'
 
             arr = np.array([n, timedelta(1)])
-            assert lib.infer_dtype(arr, skipna=False) == 'timedelta'
+            assert lib.infer_dtype(arr, skipna=True) == 'timedelta'
 
             arr = np.array([n, pd.Timedelta('1 days'), n])
-            assert lib.infer_dtype(arr, skipna=False) == 'timedelta'
+            assert lib.infer_dtype(arr, skipna=True) == 'timedelta'
 
             arr = np.array([n, np.timedelta64(1, 'D'), n])
-            assert lib.infer_dtype(arr, skipna=False) == 'timedelta'
+            assert lib.infer_dtype(arr, skipna=True) == 'timedelta'
 
             arr = np.array([n, timedelta(1), n])
-            assert lib.infer_dtype(arr, skipna=False) == 'timedelta'
+            assert lib.infer_dtype(arr, skipna=True) == 'timedelta'
 
         # different type of nat
         arr = np.array([np.datetime64('nat'), np.timedelta64(1, 'D')],
@@ -755,19 +763,19 @@ def test_infer_dtype_period(self):
         # GH 13664
         arr = np.array([pd.Period('2011-01', freq='D'),
                         pd.Period('2011-02', freq='D')])
-        assert lib.infer_dtype(arr, skipna=False) == 'period'
+        assert lib.infer_dtype(arr, skipna=True) == 'period'
 
         arr = np.array([pd.Period('2011-01', freq='D'),
                         pd.Period('2011-02', freq='M')])
-        assert lib.infer_dtype(arr, skipna=False) == 'period'
+        assert lib.infer_dtype(arr, skipna=True) == 'period'
 
         # starts with nan
         for n in [pd.NaT, np.nan]:
             arr = np.array([n, pd.Period('2011-01', freq='D')])
-            assert lib.infer_dtype(arr, skipna=False) == 'period'
+            assert lib.infer_dtype(arr, skipna=True) == 'period'
 
             arr = np.array([n, pd.Period('2011-01', freq='D'), n])
-            assert lib.infer_dtype(arr, skipna=False) == 'period'
+            assert lib.infer_dtype(arr, skipna=True) == 'period'
 
         # different type of nat
         arr = np.array([np.datetime64('nat'), pd.Period('2011-01', freq='M')],
@@ -846,7 +854,7 @@ def test_infer_datetimelike_array_nan_nat_like(self, first, second,
 
     def test_infer_dtype_all_nan_nat_like(self):
         arr = np.array([np.nan, np.nan])
-        assert lib.infer_dtype(arr, skipna=False) == 'floating'
+        assert lib.infer_dtype(arr, skipna=True) == 'floating'
 
         # nan and None mix are result in mixed
         arr = np.array([np.nan, np.nan, None])
@@ -1043,17 +1051,17 @@ def test_categorical(self):
         # GH 8974
         from pandas import Categorical, Series
         arr = Categorical(list('abc'))
-        result = lib.infer_dtype(arr, skipna=False)
+        result = lib.infer_dtype(arr, skipna=True)
         assert result == 'categorical'
 
-        result = lib.infer_dtype(Series(arr), skipna=False)
+        result = lib.infer_dtype(Series(arr), skipna=True)
         assert result == 'categorical'
 
         arr = Categorical(list('abc'), categories=['cegfab'], ordered=True)
-        result = lib.infer_dtype(arr, skipna=False)
+        result = lib.infer_dtype(arr, skipna=True)
         assert result == 'categorical'
 
-        result = lib.infer_dtype(Series(arr), skipna=False)
+        result = lib.infer_dtype(Series(arr), skipna=True)
         assert result == 'categorical'
 
 

diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py
@@ -813,12 +813,12 @@ def test_constructor_with_datetime_tz(self):
         s = Series([pd.Timestamp('2013-01-01 13:00:00-0800', tz='US/Pacific'),
                     pd.Timestamp('2013-01-02 14:00:00-0800', tz='US/Pacific')])
         assert s.dtype == 'datetime64[ns, US/Pacific]'
-        assert lib.infer_dtype(s, skipna=False) == 'datetime64'
+        assert lib.infer_dtype(s, skipna=True) == 'datetime64'
 
         s = Series([pd.Timestamp('2013-01-01 13:00:00-0800', tz='US/Pacific'),
                     pd.Timestamp('2013-01-02 14:00:00-0800', tz='US/Eastern')])
         assert s.dtype == 'object'
-        assert lib.infer_dtype(s, skipna=False) == 'datetime'
+        assert lib.infer_dtype(s, skipna=True) == 'datetime'
 
         # with all NaT
         s = Series(pd.NaT, index=[0, 1], dtype='datetime64[ns, US/Eastern]')