DEPR: deprecate default of skipna=False in infer_dtype #24050

Merged: 19 commits, Jan 4, 2019
Changes from 8 commits
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v0.24.0.rst
@@ -1143,7 +1143,8 @@ Deprecations
- :func:`pandas.api.types.is_period` is deprecated in favor of `pandas.api.types.is_period_dtype` (:issue:`23917`)
- :func:`pandas.api.types.is_datetimetz` is deprecated in favor of `pandas.api.types.is_datetime64tz` (:issue:`23917`)
- Creating a :class:`TimedeltaIndex` or :class:`DatetimeIndex` by passing range arguments `start`, `end`, and `periods` is deprecated in favor of :func:`timedelta_range` and :func:`date_range` (:issue:`23919`)
- Passing a string alias like ``'datetime64[ns, UTC]'`` as the ``unit`` parameter to :class:`DatetimeTZDtype` is deprecated. Use :class:`DatetimeTZDtype.construct_from_string` instead (:issue:`23990`).
- Passing a string alias like ``'datetime64[ns, UTC]'`` as the `unit` parameter to :class:`DatetimeTZDtype` is deprecated. Use :class:`DatetimeTZDtype.construct_from_string` instead (:issue:`23990`).
- The ``skipna`` parameter of :meth:`~pandas.api.types.infer_dtype` will switch to ``True`` by default in a future version of pandas (:issue:`17066`, :issue:`24050`)
- In :meth:`Series.where` with Categorical data, providing an ``other`` that is not present in the categories is deprecated. Convert the categorical to a different dtype or add the ``other`` to the categories first (:issue:`24077`).
- :meth:`Series.clip_lower`, :meth:`Series.clip_upper`, :meth:`DataFrame.clip_lower` and :meth:`DataFrame.clip_upper` are deprecated and will be removed in a future version. Use ``Series.clip(lower=threshold)``, ``Series.clip(upper=threshold)`` and the equivalent ``DataFrame`` methods (:issue:`24203`)
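A minimal sketch of what this deprecation means for callers (assuming pandas 0.24.x; the results and warning category shown in comments are illustrative):

    import warnings
    import numpy as np
    from pandas.api.types import infer_dtype

    values = ['a', 'b', np.nan]

    # Omitting skipna keeps the old behaviour (skipna=False) but emits a FutureWarning.
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter('always')
        infer_dtype(values)  # 'mixed'
        assert any(issubclass(w.category, FutureWarning) for w in caught)

    # Passing skipna explicitly silences the warning.
    infer_dtype(values, skipna=False)  # 'mixed'
    infer_dtype(values, skipna=True)   # 'string'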

21 changes: 14 additions & 7 deletions pandas/_libs/lib.pyx
@@ -4,6 +4,7 @@ from fractions import Fraction
from numbers import Number

import sys
import warnings

import cython
from cython import Py_ssize_t
@@ -622,7 +623,7 @@ def clean_index_list(obj: list):
return obj, all_arrays

# don't force numpy coerce with nan's
inferred = infer_dtype(obj)
inferred = infer_dtype(obj, skipna=False)
if inferred in ['string', 'bytes', 'unicode', 'mixed', 'mixed-integer']:
return np.asarray(obj, dtype=object), 0
elif inferred in ['integer']:
@@ -1078,7 +1079,7 @@ cdef _try_infer_map(v):
return None


def infer_dtype(value: object, skipna: bool=False) -> str:
def infer_dtype(value: object, skipna: object=None) -> str:
"""
Efficiently infer the type of a passed val, or list-like
array of values. Return a string describing the type.
@@ -1087,8 +1088,7 @@ def infer_dtype(value: object, skipna: bool=False) -> str:
----------
value : scalar, list, ndarray, or pandas type
skipna : bool, default False
Ignore NaN values when inferring the type. The default of ``False``
will be deprecated in a later version of pandas.
Ignore NaN values when inferring the type.

.. versionadded:: 0.21.0

@@ -1185,6 +1185,12 @@ def infer_dtype(value: object, skipna: bool=False) -> str:
bint seen_pdnat = False
bint seen_val = False

if skipna is None:
msg = ('A future version of pandas will default to `skipna=True`. To '
'silence this warning, pass `skipna=True|False` explicitly.')
warnings.warn(msg, FutureWarning, stacklevel=2)
skipna = False

if util.is_array(value):
values = value
elif hasattr(value, 'dtype'):
@@ -1209,6 +1215,10 @@ def infer_dtype(value: object, skipna: bool=False) -> str:
values = construct_1d_object_array_from_listlike(value)

values = getattr(values, 'values', values)

# make contiguous
values = values.ravel()

if skipna:
values = values[~isnaobj(values)]

@@ -1219,9 +1229,6 @@ def infer_dtype(value: object, skipna: bool=False) -> str:
if values.dtype != np.object_:
values = values.astype('O')

# make contiguous
values = values.ravel()

n = len(values)
if n == 0:
return 'empty'
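The ``ravel`` call is now applied before the ``skipna`` mask, so multi-dimensional input is flattened before ``isnaobj`` (which works on 1-D object data) runs. A rough illustration of the user-visible effect (a sketch, assuming pandas 0.24.x; results shown in comments):

    import numpy as np
    from pandas.api.types import infer_dtype

    arr2d = np.array([['a', np.nan], ['b', 'c']], dtype=object)

    # The 2-D input is flattened first, then the NaN is dropped when skipna=True.
    infer_dtype(arr2d, skipna=True)   # 'string'
    infer_dtype(arr2d, skipna=False)  # 'mixed'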
14 changes: 9 additions & 5 deletions pandas/core/algorithms.py
@@ -165,7 +165,7 @@ def _ensure_arraylike(values):
ensure that we are arraylike if not already
"""
if not is_array_like(values):
inferred = lib.infer_dtype(values)
inferred = lib.infer_dtype(values, skipna=True)
if inferred in ['mixed', 'string', 'unicode']:
if isinstance(values, tuple):
values = list(values)
@@ -202,8 +202,10 @@ def _get_hashtable_algo(values):

if ndtype == 'object':

# its cheaper to use a String Hash Table than Object
if lib.infer_dtype(values) in ['string']:
# it's cheaper to use a String Hash Table than Object; we infer
# including nulls because that is the only difference between
# StringHashTable and ObjectHashtable
if lib.infer_dtype(values, skipna=False) in ['string']:
ndtype = 'string'
else:
ndtype = 'object'
@@ -220,8 +222,10 @@ def _get_data_algo(values, func_map):
values, dtype, ndtype = _ensure_data(values)
if ndtype == 'object':

# its cheaper to use a String Hash Table than Object
if lib.infer_dtype(values) in ['string']:
# it's cheaper to use a String Hash Table than Object; we infer
# including nulls because that is the only difference between
# StringHashTable and ObjectHashtable
if lib.infer_dtype(values, skipna=False) in ['string']:
ndtype = 'string'

f = func_map.get(ndtype, func_map['object'])
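The reason ``skipna=False`` is kept in both helpers above: the presence of missing values is exactly what separates an array the string hash table can handle from one that needs the object table. A rough illustration (a sketch, assuming pandas 0.24.x; results shown in comments):

    import numpy as np
    from pandas.api.types import infer_dtype

    clean = np.array(['a', 'b'], dtype=object)
    with_nan = np.array(['a', np.nan], dtype=object)

    infer_dtype(clean, skipna=False)     # 'string' -> string hash table is safe
    infer_dtype(with_nan, skipna=False)  # 'mixed'  -> fall back to the object table
    infer_dtype(with_nan, skipna=True)   # 'string' -> would hide the NaN from the choice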
2 changes: 1 addition & 1 deletion pandas/core/arrays/datetimes.py
@@ -1504,7 +1504,7 @@ def sequence_to_dt64ns(data, dtype=None, copy=False,
# TODO: We do not have tests specific to string-dtypes,
# also complex or categorical or other extension
copy = False
if lib.infer_dtype(data) == 'integer':
if lib.infer_dtype(data, skipna=True) == 'integer':
data = data.astype(np.int64)
else:
# data comes back here as either i8 to denote UTC timestamps
4 changes: 2 additions & 2 deletions pandas/core/arrays/integer.py
@@ -170,8 +170,8 @@ def coerce_to_array(values, dtype, mask=None, copy=False):

values = np.array(values, copy=copy)
if is_object_dtype(values):
inferred_type = lib.infer_dtype(values)
if inferred_type is 'mixed' and isna(values).all():
inferred_type = lib.infer_dtype(values, skipna=True)
if inferred_type == 'empty':
values = np.empty(len(values))
values.fill(np.nan)
elif inferred_type not in ['floating', 'integer',
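With ``skipna=True``, an object array containing only missing values now infers as 'empty' rather than 'mixed', which is what the simplified check above relies on (a sketch, assuming pandas 0.24.x; results shown in comments):

    import numpy as np
    from pandas.api.types import infer_dtype

    all_missing = np.array([np.nan, None], dtype=object)

    infer_dtype(all_missing, skipna=False)  # 'mixed'
    infer_dtype(all_missing, skipna=True)   # 'empty'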
2 changes: 1 addition & 1 deletion pandas/core/arrays/timedeltas.py
@@ -487,7 +487,7 @@ def __floordiv__(self, other):
elif is_object_dtype(other):
result = [self[n] // other[n] for n in range(len(self))]
result = np.array(result)
if lib.infer_dtype(result) == 'timedelta':
if lib.infer_dtype(result, skipna=True) == 'timedelta':
result, _ = sequence_to_td64ns(result)
return type(self)(result)
return result
12 changes: 7 additions & 5 deletions pandas/core/dtypes/cast.py
@@ -73,7 +73,8 @@ def trans(x):

if isinstance(dtype, string_types):
if dtype == 'infer':
inferred_type = lib.infer_dtype(ensure_object(result.ravel()))
inferred_type = lib.infer_dtype(ensure_object(result.ravel()),
skipna=True)
if inferred_type == 'boolean':
dtype = 'bool'
elif inferred_type == 'integer':
@@ -458,7 +459,7 @@ def infer_dtype_from_array(arr, pandas_dtype=False):
return arr.dtype, np.asarray(arr)

# don't force numpy coerce with nan's
inferred = lib.infer_dtype(arr)
inferred = lib.infer_dtype(arr, skipna=True)
if inferred in ['string', 'bytes', 'unicode',
'mixed', 'mixed-integer']:
return (np.object_, arr)
@@ -937,10 +938,11 @@ def try_timedelta(v):

# We have at least a NaT and a string
# try timedelta first to avoid spurious datetime conversions
# e.g. '00:00:01' is a timedelta but
# technically is also a datetime
# e.g. '00:00:01' is a timedelta but technically is also a datetime
value = try_timedelta(v)
if lib.infer_dtype(value) in ['mixed']:
if lib.infer_dtype(value, skipna=False) in ['mixed']:
# cannot skip missing values, as NaT implies that the string
# is actually a datetime
value = try_datetime(v)

return value
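Why ``skipna=False`` is needed in the check above: a NaT sitting next to strings is the hint that the strings are really datetimes, and skipping missing values would hide that hint, so the datetime conversion would never be attempted. A rough illustration (a sketch, assuming pandas 0.24.x; results shown in comments):

    import numpy as np
    import pandas as pd
    from pandas.api.types import infer_dtype

    v = np.array(['2019-01-04', pd.NaT], dtype=object)

    infer_dtype(v, skipna=True)   # 'string' -> looks like plain strings, no datetime attempt
    infer_dtype(v, skipna=False)  # 'mixed'  -> the NaT is visible, so datetime parsing is tried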
3 changes: 2 additions & 1 deletion pandas/core/dtypes/common.py
@@ -704,7 +704,8 @@ def is_datetime_arraylike(arr):
if isinstance(arr, ABCDatetimeIndex):
return True
elif isinstance(arr, (np.ndarray, ABCSeries)):
return arr.dtype == object and lib.infer_dtype(arr) == 'datetime'
return (is_object_dtype(arr.dtype)
and lib.infer_dtype(arr, skipna=True) == 'datetime')
return getattr(arr, 'inferred_type', None) == 'datetime'


2 changes: 1 addition & 1 deletion pandas/core/dtypes/missing.py
@@ -470,7 +470,7 @@ def _infer_fill_value(val):
if is_datetimelike(val):
return np.array('NaT', dtype=val.dtype)
elif is_object_dtype(val.dtype):
dtype = lib.infer_dtype(ensure_object(val))
dtype = lib.infer_dtype(ensure_object(val), skipna=True)
if dtype in ['datetime', 'datetime64']:
return np.array('NaT', dtype=_NS_DTYPE)
elif dtype in ['timedelta', 'timedelta64']:
8 changes: 4 additions & 4 deletions pandas/core/indexes/base.py
@@ -344,7 +344,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
# should not be coerced
# GH 11836
if is_integer_dtype(dtype):
inferred = lib.infer_dtype(data)
inferred = lib.infer_dtype(data, skipna=True)
if inferred == 'integer':
data = maybe_cast_to_integer_array(data, dtype,
copy=copy)
@@ -374,7 +374,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
else:
data = data.astype(dtype)
elif is_float_dtype(dtype):
inferred = lib.infer_dtype(data)
inferred = lib.infer_dtype(data, skipna=True)
if inferred == 'string':
pass
else:
@@ -412,7 +412,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
subarr = subarr.copy()

if dtype is None:
inferred = lib.infer_dtype(subarr)
inferred = lib.infer_dtype(subarr, skipna=False)
if inferred == 'integer':
try:
return cls._try_convert_to_int_index(
@@ -1734,7 +1734,7 @@ def inferred_type(self):
"""
Return a string of the type inferred from the values.
"""
return lib.infer_dtype(self)
return lib.infer_dtype(self, skipna=True)

@cache_readonly
def is_all_dates(self):
3 changes: 2 additions & 1 deletion pandas/core/indexes/multi.py
@@ -2317,7 +2317,8 @@ def _partial_tup_index(self, tup, side='left'):
section = labs[start:end]

if lab not in lev:
if not lev.is_type_compatible(lib.infer_dtype([lab])):
if not lev.is_type_compatible(lib.infer_dtype([lab],
skipna=True)):
raise TypeError('Level type mismatch: %s' % lab)

# short circuit
2 changes: 1 addition & 1 deletion pandas/core/internals/construction.py
@@ -658,7 +658,7 @@ def sanitize_array(data, index, dtype=None, copy=False,
subarr = np.array(data, dtype=object, copy=copy)

if is_object_dtype(subarr.dtype) and dtype != 'object':
inferred = lib.infer_dtype(subarr)
inferred = lib.infer_dtype(subarr, skipna=True)
if inferred == 'period':
try:
subarr = period_array(subarr)
3 changes: 2 additions & 1 deletion pandas/core/reshape/merge.py
@@ -942,7 +942,8 @@ def _maybe_coerce_merge_keys(self):
'representation', UserWarning)

# let's infer and see if we are ok
elif lib.infer_dtype(lk) == lib.infer_dtype(rk):
elif (lib.infer_dtype(lk, skipna=True)
== lib.infer_dtype(rk, skipna=True)):
pass

# Check if we are trying to merge on obviously
2 changes: 1 addition & 1 deletion pandas/core/reshape/tile.py
@@ -416,7 +416,7 @@ def _convert_bin_to_numeric_type(bins, dtype):
------
ValueError if bins are not of a compat dtype to dtype
"""
bins_dtype = infer_dtype(bins)
bins_dtype = infer_dtype(bins, skipna=True)
if is_timedelta64_dtype(dtype):
if bins_dtype in ['timedelta', 'timedelta64']:
bins = to_timedelta(bins).view(np.int64)
4 changes: 2 additions & 2 deletions pandas/core/series.py
@@ -870,7 +870,7 @@ def _get_with(self, key):
if isinstance(key, Index):
key_type = key.inferred_type
else:
key_type = lib.infer_dtype(key)
key_type = lib.infer_dtype(key, skipna=True)

if key_type == 'integer':
if self.index.is_integer() or self.index.is_floating():
@@ -1007,7 +1007,7 @@ def _set_with(self, key, value):
if isinstance(key, Index):
key_type = key.inferred_type
else:
key_type = lib.infer_dtype(key)
key_type = lib.infer_dtype(key, skipna=True)

if key_type == 'integer':
if self.index.inferred_type == 'integer':
2 changes: 1 addition & 1 deletion pandas/core/sorting.py
@@ -454,7 +454,7 @@ def sort_mixed(values):
return np.concatenate([nums, np.asarray(strs, dtype=object)])

sorter = None
if PY3 and lib.infer_dtype(values) == 'mixed-integer':
if PY3 and lib.infer_dtype(values, skipna=True) == 'mixed-integer':
# unorderable in py3 if mixed str/int
ordered = sort_mixed(values)
else:
2 changes: 1 addition & 1 deletion pandas/io/parsers.py
@@ -1299,7 +1299,7 @@ def _validate_usecols_arg(usecols):
elif not is_list_like(usecols):
raise ValueError(msg)
else:
usecols_dtype = lib.infer_dtype(usecols)
usecols_dtype = lib.infer_dtype(usecols, skipna=True)
if usecols_dtype not in ('empty', 'integer',
'string', 'unicode'):
raise ValueError(msg)
12 changes: 6 additions & 6 deletions pandas/io/pytables.py
@@ -1952,7 +1952,7 @@ def set_atom(self, block, block_items, existing_col, min_itemsize,
return self.set_atom_complex(block)

dtype = block.dtype.name
inferred_type = lib.infer_dtype(block.values)
inferred_type = lib.infer_dtype(block.values, skipna=True)

if inferred_type == 'date':
raise TypeError(
@@ -1998,15 +1998,15 @@ def set_atom_string(self, block, block_items, existing_col, min_itemsize,
data = block.values

# see if we have a valid string type
inferred_type = lib.infer_dtype(data.ravel())
inferred_type = lib.infer_dtype(data.ravel(), skipna=True)
if inferred_type != 'string':

# we cannot serialize this data, so report an exception on a column
# by column basis
for i, item in enumerate(block_items):

col = block.iget(i)
inferred_type = lib.infer_dtype(col.ravel())
inferred_type = lib.infer_dtype(col.ravel(), skipna=True)
if inferred_type != 'string':
raise TypeError(
"Cannot serialize the column [%s] because\n"
@@ -2743,7 +2743,7 @@ def write_array(self, key, value, items=None):

# infer the type, warn if we have a non-string type here (for
# performance)
inferred_type = lib.infer_dtype(value.ravel())
inferred_type = lib.infer_dtype(value.ravel(), skipna=True)
if empty_array:
pass
elif inferred_type == 'string':
@@ -4510,7 +4510,7 @@ def _convert_index(index, encoding=None, errors='strict', format_type=None):
if isinstance(index, MultiIndex):
raise TypeError('MultiIndex not supported here!')

inferred_type = lib.infer_dtype(index)
inferred_type = lib.infer_dtype(index, skipna=True)

values = np.asarray(index)

@@ -4743,7 +4743,7 @@ def __init__(self, table, where=None, start=None, stop=None):

# see if we have a passed coordinate like
try:
inferred = lib.infer_dtype(where)
inferred = lib.infer_dtype(where, skipna=True)
if inferred == 'integer' or inferred == 'boolean':
where = np.asarray(where)
if where.dtype == np.bool_: