From dde47177378a07bfd31b53299ab9102179ec4434 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Tue, 25 Jul 2017 13:27:45 -0400 Subject: [PATCH] ENH: Add skipna parameter to infer_dtype (#17066) Currently defaults to False for backwards compatibility. Will default to True in the future. Closes gh-17059. --- doc/source/whatsnew/v0.21.0.txt | 2 + pandas/_libs/src/inference.pyx | 548 +++++++++++++++----------- pandas/tests/dtypes/test_inference.py | 33 +- 3 files changed, 358 insertions(+), 225 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 89b8a53e396d19..35169f955151c3 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -24,6 +24,8 @@ New features `_ on most readers and writers (:issue:`13823`) - Added ``__fspath__`` method to :class:`~pandas.HDFStore`, :class:`~pandas.ExcelFile`, and :class:`~pandas.ExcelWriter` to work properly with the file system path protocol (:issue:`13823`) +- Added ``skipna`` parameter to :func:`~pandas.api.types.infer_dtype` to + support type inference in the presence of missing values (:issue:`17059`). .. _whatsnew_0210.enhancements.infer_objects: diff --git a/pandas/_libs/src/inference.pyx b/pandas/_libs/src/inference.pyx index 38e95fe6ee6524..6b5a8f20f00671 100644 --- a/pandas/_libs/src/inference.pyx +++ b/pandas/_libs/src/inference.pyx @@ -1,6 +1,7 @@ import sys from decimal import Decimal cimport util +cimport cython from tslib import NaT, get_timezone from datetime import datetime, timedelta iNaT = util.get_nat() @@ -222,7 +223,7 @@ cdef _try_infer_map(v): return None -def infer_dtype(object value): +def infer_dtype(object value, bint skipna=False): """ Effeciently infer the type of a passed val, or list-like array of values. Return a string describing the type. @@ -230,6 +231,11 @@ def infer_dtype(object value): Parameters ---------- value : scalar, list, ndarray, or pandas type + skipna : bool, default False + Ignore NaN values when inferring the type. The default of ``False`` + will be deprecated in a later version of pandas. + + .. 
versionadded:: 0.21.0 Returns ------- @@ -272,6 +278,12 @@ def infer_dtype(object value): >>> infer_dtype(['foo', 'bar']) 'string' + >>> infer_dtype(['a', np.nan, 'b'], skipna=True) + 'string' + + >>> infer_dtype(['a', np.nan, 'b'], skipna=False) + 'mixed' + >>> infer_dtype([b'foo', b'bar']) 'bytes' @@ -310,13 +322,13 @@ def infer_dtype(object value): >>> infer_dtype(pd.Series(list('aabc')).astype('category')) 'categorical' - """ cdef: Py_ssize_t i, n object val ndarray values - bint seen_pdnat = False, seen_val = False + bint seen_pdnat = False + bint seen_val = False if isinstance(value, np.ndarray): values = value @@ -356,7 +368,7 @@ def infer_dtype(object value): values = values.ravel() # try to use a valid value - for i from 0 <= i < n: + for i in range(n): val = util.get_value_1d(values, i) # do not use is_nul_datetimelike to keep @@ -403,11 +415,11 @@ def infer_dtype(object value): return 'datetime' elif is_date(val): - if is_date_array(values): + if is_date_array(values, skipna=skipna): return 'date' elif is_time(val): - if is_time_array(values): + if is_time_array(values, skipna=skipna): return 'time' elif is_decimal(val): @@ -420,19 +432,19 @@ def infer_dtype(object value): return 'mixed-integer-float' elif util.is_bool_object(val): - if is_bool_array(values): + if is_bool_array(values, skipna=skipna): return 'boolean' elif PyString_Check(val): - if is_string_array(values): + if is_string_array(values, skipna=skipna): return 'string' elif PyUnicode_Check(val): - if is_unicode_array(values): + if is_unicode_array(values, skipna=skipna): return 'unicode' elif PyBytes_Check(val): - if is_bytes_array(values): + if is_bytes_array(values, skipna=skipna): return 'bytes' elif is_period(val): @@ -593,190 +605,284 @@ cdef inline bint is_timedelta(object o): return PyDelta_Check(o) or util.is_timedelta64_object(o) -cpdef bint is_bool_array(ndarray values): - cdef: - Py_ssize_t i, n = len(values) - ndarray[object] objbuf +cdef class Validator: - if issubclass(values.dtype.type, np.bool_): - return True - elif values.dtype == np.object_: - objbuf = values + cdef: + Py_ssize_t n + np.dtype dtype + bint skipna + + def __cinit__( + self, + Py_ssize_t n, + np.dtype dtype=np.dtype(np.object_), + bint skipna=False + ): + self.n = n + self.dtype = dtype + self.skipna = skipna + + cdef bint validate(self, object[:] values) except -1: + if not self.n: + return False - if n == 0: + if self.is_array_typed(): + return True + elif self.dtype.type_num == NPY_OBJECT: + if self.skipna: + return self._validate_skipna(values) + else: + return self._validate(values) + else: return False + @cython.wraparound(False) + @cython.boundscheck(False) + cdef bint _validate(self, object[:] values) except -1: + cdef: + Py_ssize_t i + Py_ssize_t n = self.n + for i in range(n): - if not util.is_bool_object(objbuf[i]): + if not self.is_valid(values[i]): return False - return True - else: + + return self.finalize_validate() + + @cython.wraparound(False) + @cython.boundscheck(False) + cdef bint _validate_skipna(self, object[:] values) except -1: + cdef: + Py_ssize_t i + Py_ssize_t n = self.n + + for i in range(n): + if not self.is_valid_skipna(values[i]): + return False + + return self.finalize_validate_skipna() + + cdef bint is_valid(self, object value) except -1: + return self.is_value_typed(value) + + cdef bint is_valid_skipna(self, object value) except -1: + return self.is_valid(value) or self.is_valid_null(value) + + cdef bint is_value_typed(self, object value) except -1: + raise NotImplementedError( + '{} child class 
must define is_value_typed'.format( + type(self).__name__ + ) + ) + + cdef bint is_valid_null(self, object value) except -1: + return util._checknull(value) + + cdef bint is_array_typed(self) except -1: return False + cdef inline bint finalize_validate(self): + return True + + cdef bint finalize_validate_skipna(self): + # TODO(phillipc): Remove the existing validate methods and replace them + # with the skipna versions upon full deprecation of skipna=False + return True + + +cdef class BoolValidator(Validator): + + cdef inline bint is_value_typed(self, object value) except -1: + return util.is_bool_object(value) + + cdef inline bint is_array_typed(self) except -1: + return issubclass(self.dtype.type, np.bool_) + + +cpdef bint is_bool_array(ndarray values, bint skipna=False): + cdef: + BoolValidator validator = BoolValidator( + len(values), + values.dtype, + skipna=skipna + ) + return validator.validate(values) + + +cdef class IntegerValidator(Validator): + + cdef inline bint is_value_typed(self, object value) except -1: + return util.is_integer_object(value) + + cdef inline bint is_array_typed(self) except -1: + return issubclass(self.dtype.type, np.integer) + cpdef bint is_integer_array(ndarray values): cdef: - Py_ssize_t i, n = len(values) - ndarray[object] objbuf + IntegerValidator validator = IntegerValidator( + len(values), + values.dtype, + ) + return validator.validate(values) - if issubclass(values.dtype.type, np.integer): - return True - elif values.dtype == np.object_: - objbuf = values - if n == 0: - return False +cdef class IntegerFloatValidator(Validator): - for i in range(n): - if not util.is_integer_object(objbuf[i]): - return False - return True - else: - return False + cdef inline bint is_value_typed(self, object value) except -1: + return util.is_integer_object(value) or util.is_float_object(value) + + cdef inline bint is_array_typed(self) except -1: + return issubclass(self.dtype.type, np.integer) cpdef bint is_integer_float_array(ndarray values): cdef: - Py_ssize_t i, n = len(values) - ndarray[object] objbuf + IntegerFloatValidator validator = IntegerFloatValidator( + len(values), + values.dtype, + ) + return validator.validate(values) - if issubclass(values.dtype.type, np.integer): - return True - elif values.dtype == np.object_: - objbuf = values - if n == 0: - return False +cdef class FloatValidator(Validator): - for i in range(n): - if not (util.is_integer_object(objbuf[i]) or - util.is_float_object(objbuf[i])): + cdef inline bint is_value_typed(self, object value) except -1: + return util.is_float_object(value) - return False - return True - else: - return False + cdef inline bint is_array_typed(self) except -1: + return issubclass(self.dtype.type, np.floating) cpdef bint is_float_array(ndarray values): - cdef: - Py_ssize_t i, n = len(values) - ndarray[object] objbuf + cdef FloatValidator validator = FloatValidator(len(values), values.dtype) + return validator.validate(values) - if issubclass(values.dtype.type, np.floating): - return True - elif values.dtype == np.object_: - objbuf = values - if n == 0: - return False +cdef class StringValidator(Validator): - for i in range(n): - if not util.is_float_object(objbuf[i]): - return False - return True - else: - return False + cdef inline bint is_value_typed(self, object value) except -1: + return PyString_Check(value) + + cdef inline bint is_array_typed(self) except -1: + return issubclass(self.dtype.type, np.str_) -cpdef bint is_string_array(ndarray values): +cpdef bint is_string_array(ndarray values, bint 
skipna=False): cdef: - Py_ssize_t i, n = len(values) - ndarray[object] objbuf + StringValidator validator = StringValidator( + len(values), + values.dtype, + skipna=skipna, + ) + return validator.validate(values) - if ((PY2 and issubclass(values.dtype.type, np.string_)) or - not PY2 and issubclass(values.dtype.type, np.unicode_)): - return True - elif values.dtype == np.object_: - objbuf = values - if n == 0: - return False +cdef class UnicodeValidator(Validator): - for i in range(n): - if not PyString_Check(objbuf[i]): - return False - return True - else: - return False + cdef inline bint is_value_typed(self, object value) except -1: + return PyUnicode_Check(value) + + cdef inline bint is_array_typed(self) except -1: + return issubclass(self.dtype.type, np.unicode_) -cpdef bint is_unicode_array(ndarray values): +cpdef bint is_unicode_array(ndarray values, bint skipna=False): cdef: - Py_ssize_t i, n = len(values) - ndarray[object] objbuf + UnicodeValidator validator = UnicodeValidator( + len(values), + values.dtype, + skipna=skipna, + ) + return validator.validate(values) - if issubclass(values.dtype.type, np.unicode_): - return True - elif values.dtype == np.object_: - objbuf = values - if n == 0: - return False +cdef class BytesValidator(Validator): - for i in range(n): - if not PyUnicode_Check(objbuf[i]): - return False - return True - else: - return False + cdef inline bint is_value_typed(self, object value) except -1: + return PyBytes_Check(value) + + cdef inline bint is_array_typed(self) except -1: + return issubclass(self.dtype.type, np.bytes_) -cpdef bint is_bytes_array(ndarray values): +cpdef bint is_bytes_array(ndarray values, bint skipna=False): cdef: - Py_ssize_t i, n = len(values) - ndarray[object] objbuf + BytesValidator validator = BytesValidator( + len(values), + values.dtype, + skipna=skipna + ) + return validator.validate(values) - if issubclass(values.dtype.type, np.bytes_): - return True - elif values.dtype == np.object_: - objbuf = values - if n == 0: - return False +cdef class TemporalValidator(Validator): + + cdef Py_ssize_t generic_null_count + + def __cinit__( + self, + Py_ssize_t n, + np.dtype dtype=np.dtype(np.object_), + bint skipna=False + ): + self.n = n + self.dtype = dtype + self.skipna = skipna + self.generic_null_count = 0 + + cdef inline bint is_valid(self, object value) except -1: + return self.is_value_typed(value) or self.is_valid_null(value) + + cdef bint is_valid_null(self, object value) except -1: + raise NotImplementedError( + '{} child class must define is_valid_null'.format( + type(self).__name__ + ) + ) + + cdef inline bint is_valid_skipna(self, object value) except -1: + cdef: + bint is_typed_null = self.is_valid_null(value) + bint is_generic_null = util._checknull(value) + self.generic_null_count += is_typed_null and is_generic_null + return self.is_value_typed(value) or is_typed_null or is_generic_null + + cdef inline bint finalize_validate_skipna(self): + return self.generic_null_count != self.n - for i in range(n): - if not PyBytes_Check(objbuf[i]): - return False - return True - else: - return False + +cdef class DatetimeValidator(TemporalValidator): + + cdef bint is_value_typed(self, object value) except -1: + return is_datetime(value) + + cdef inline bint is_valid_null(self, object value) except -1: + return is_null_datetime64(value) cpdef bint is_datetime_array(ndarray[object] values): - cdef Py_ssize_t i, null_count = 0, n = len(values) - cdef object v - if n == 0: - return False + cdef: + DatetimeValidator validator = 
DatetimeValidator( + len(values), + skipna=True, + ) + return validator.validate(values) - # return False for all nulls - for i in range(n): - v = values[i] - if is_null_datetime64(v): - # we are a regular null - if util._checknull(v): - null_count += 1 - elif not is_datetime(v): - return False - return null_count != n +cdef class Datetime64Validator(DatetimeValidator): -cpdef bint is_datetime64_array(ndarray values): - cdef Py_ssize_t i, null_count = 0, n = len(values) - cdef object v - if n == 0: - return False + cdef inline bint is_value_typed(self, object value) except -1: + return util.is_datetime64_object(value) - # return False for all nulls - for i in range(n): - v = values[i] - if is_null_datetime64(v): - # we are a regular null - if util._checknull(v): - null_count += 1 - elif not util.is_datetime64_object(v): - return False - return null_count != n + +cpdef bint is_datetime64_array(ndarray values): + cdef: + Datetime64Validator validator = Datetime64Validator( + len(values), + skipna=True, + ) + return validator.validate(values) cpdef bint is_datetime_with_singletz_array(ndarray[object] values): @@ -807,108 +913,104 @@ cpdef bint is_datetime_with_singletz_array(ndarray[object] values): return True +cdef class TimedeltaValidator(TemporalValidator): + + cdef bint is_value_typed(self, object value) except -1: + return PyDelta_Check(value) + + cdef inline bint is_valid_null(self, object value) except -1: + return is_null_timedelta64(value) + + cpdef bint is_timedelta_array(ndarray values): - cdef Py_ssize_t i, null_count = 0, n = len(values) - cdef object v - if n == 0: - return False - for i in range(n): - v = values[i] - if is_null_timedelta64(v): - # we are a regular null - if util._checknull(v): - null_count += 1 - elif not PyDelta_Check(v): - return False - return null_count != n + cdef: + TimedeltaValidator validator = TimedeltaValidator( + len(values), + skipna=True, + ) + return validator.validate(values) + + +cdef class Timedelta64Validator(TimedeltaValidator): + + cdef inline bint is_value_typed(self, object value) except -1: + return util.is_timedelta64_object(value) cpdef bint is_timedelta64_array(ndarray values): - cdef Py_ssize_t i, null_count = 0, n = len(values) - cdef object v - if n == 0: - return False - for i in range(n): - v = values[i] - if is_null_timedelta64(v): - # we are a regular null - if util._checknull(v): - null_count += 1 - elif not util.is_timedelta64_object(v): - return False - return null_count != n + cdef: + Timedelta64Validator validator = Timedelta64Validator( + len(values), + skipna=True, + ) + return validator.validate(values) + + +cdef class AnyTimedeltaValidator(TimedeltaValidator): + + cdef inline bint is_value_typed(self, object value) except -1: + return is_timedelta(value) cpdef bint is_timedelta_or_timedelta64_array(ndarray values): """ infer with timedeltas and/or nat/none """ - cdef Py_ssize_t i, null_count = 0, n = len(values) - cdef object v - if n == 0: - return False - for i in range(n): - v = values[i] - if is_null_timedelta64(v): - # we are a regular null - if util._checknull(v): - null_count += 1 - elif not is_timedelta(v): - return False - return null_count != n + cdef: + AnyTimedeltaValidator validator = AnyTimedeltaValidator( + len(values), + skipna=True, + ) + return validator.validate(values) -cpdef bint is_date_array(ndarray[object] values): - cdef Py_ssize_t i, n = len(values) - if n == 0: - return False - for i in range(n): - if not is_date(values[i]): - return False - return True +cdef class DateValidator(Validator): + 
cdef inline bint is_value_typed(self, object value) except -1: + return is_date(value) + + +cpdef bint is_date_array(ndarray[object] values, bint skipna=False): + cdef DateValidator validator = DateValidator(len(values), skipna=skipna) + return validator.validate(values) -cpdef bint is_time_array(ndarray[object] values): - cdef Py_ssize_t i, n = len(values) - if n == 0: - return False - for i in range(n): - if not is_time(values[i]): - return False - return True + +cdef class TimeValidator(Validator): + + cdef inline bint is_value_typed(self, object value) except -1: + return is_time(value) + + +cpdef bint is_time_array(ndarray[object] values, bint skipna=False): + cdef TimeValidator validator = TimeValidator(len(values), skipna=skipna) + return validator.validate(values) + + +cdef class PeriodValidator(TemporalValidator): + + cdef inline bint is_value_typed(self, object value) except -1: + return is_period(value) + + cdef inline bint is_valid_null(self, object value) except -1: + return is_null_period(value) cpdef bint is_period_array(ndarray[object] values): - cdef Py_ssize_t i, null_count = 0, n = len(values) - cdef object v - if n == 0: - return False + cdef PeriodValidator validator = PeriodValidator(len(values), skipna=True) + return validator.validate(values) - # return False for all nulls - for i in range(n): - v = values[i] - if is_null_period(v): - # we are a regular null - if util._checknull(v): - null_count += 1 - elif not is_period(v): - return False - return null_count != n + +cdef class IntervalValidator(Validator): + + cdef inline bint is_value_typed(self, object value) except -1: + return is_interval(value) cpdef bint is_interval_array(ndarray[object] values): cdef: - Py_ssize_t i, n = len(values), null_count = 0 - object v - - if n == 0: - return False - for i in range(n): - v = values[i] - if util._checknull(v): - null_count += 1 - continue - if not is_interval(v): - return False - return null_count != n + IntervalValidator validator = IntervalValidator( + len(values), + skipna=True, + ) + return validator.validate(values) cdef extern from "parse_helper.h": diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index d26ea047bb41f3..dbde7ae5081d4a 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -239,6 +239,9 @@ def test_infer_dtype_bytes(self): arr = arr.astype(object) assert lib.infer_dtype(arr) == compare + # object array of bytes with missing values + assert lib.infer_dtype([b'a', np.nan, b'c'], skipna=True) == compare + def test_isinf_scalar(self): # GH 11352 assert lib.isposinf_scalar(float('inf')) @@ -444,6 +447,10 @@ def test_bools(self): result = lib.infer_dtype(arr) assert result == 'boolean' + arr = np.array([True, np.nan, False], dtype='O') + result = lib.infer_dtype(arr, skipna=True) + assert result == 'boolean' + def test_floats(self): arr = np.array([1., 2., 3., np.float64(4), np.float32(5)], dtype='O') result = lib.infer_dtype(arr) @@ -472,11 +479,26 @@ def test_decimals(self): result = lib.infer_dtype(arr) assert result == 'mixed' + arr = np.array([Decimal(1), Decimal('NaN'), Decimal(3)]) + result = lib.infer_dtype(arr) + assert result == 'decimal' + + arr = np.array([Decimal(1), np.nan, Decimal(3)], dtype='O') + result = lib.infer_dtype(arr) + assert result == 'decimal' + def test_string(self): pass def test_unicode(self): - pass + arr = [u'a', np.nan, u'c'] + result = lib.infer_dtype(arr) + assert result == 'mixed' + + arr = [u'a', np.nan, u'c'] + result = 
lib.infer_dtype(arr, skipna=True) + expected = 'unicode' if PY2 else 'string' + assert result == expected def test_datetime(self): @@ -714,10 +736,17 @@ def test_is_datetimelike_array_all_nan_nat_like(self): def test_date(self): - dates = [date(2012, 1, x) for x in range(1, 20)] + dates = [date(2012, 1, day) for day in range(1, 20)] index = Index(dates) assert index.inferred_type == 'date' + dates = [date(2012, 1, day) for day in range(1, 20)] + [np.nan] + result = lib.infer_dtype(dates) + assert result == 'mixed' + + result = lib.infer_dtype(dates, skipna=True) + assert result == 'date' + def test_to_object_array_tuples(self): r = (5, 6) values = [r]
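
For reference (not part of the diff above): a minimal usage sketch of the behaviour this patch adds, assuming the patch is applied (pandas 0.21.0+) and using the public entry point pandas.api.types.infer_dtype referenced in the whatsnew entry. The values mirror the docstring and test examples in the patch itself.

    import numpy as np
    from pandas.api.types import infer_dtype

    # Default behaviour is unchanged: a missing value makes the result 'mixed'.
    infer_dtype(['a', np.nan, 'b'])                # 'mixed'

    # With skipna=True the NaN is ignored and the remaining values drive inference.
    infer_dtype(['a', np.nan, 'b'], skipna=True)   # 'string'

    # Same idea for booleans, as exercised in the new test_bools case.
    arr = np.array([True, np.nan, False], dtype='O')
    infer_dtype(arr, skipna=True)                  # 'boolean'

Per the commit message, skipna defaults to False for backwards compatibility and is intended to default to True in a future release.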