From 3e3256bb6038111812b4b28f6b3b049214d83d2d Mon Sep 17 00:00:00 2001
From: alimcmaster1 <alimcmaster1@gmail.com>
Date: Wed, 3 Oct 2018 12:23:22 +0100
Subject: [PATCH] Allow passing a mask to NanOps (#22865)

---
 pandas/core/nanops.py       | 404 ++++++++++++++++++++++++++++++++----
 pandas/tests/test_nanops.py |  36 +++-
 2 files changed, 391 insertions(+), 49 deletions(-)

diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
index 232d030da7f1e..2884bc1a19491 100644
--- a/pandas/core/nanops.py
+++ b/pandas/core/nanops.py
@@ -1,12 +1,16 @@
-import itertools
 import functools
+import itertools
 import operator
 import warnings
 from distutils.version import LooseVersion
 
 import numpy as np
+
+import pandas.core.common as com
 from pandas import compat
 from pandas._libs import tslibs, lib
+from pandas.core.config import get_option
+from pandas.core.dtypes.cast import _int64_max, maybe_upcast_putmask
 from pandas.core.dtypes.common import (
     _get_dtype,
     is_float, is_scalar,
@@ -17,10 +21,7 @@
     is_datetime64_dtype, is_timedelta64_dtype,
     is_datetime_or_timedelta_dtype,
     is_int_or_datetime_dtype, is_any_int_dtype)
-from pandas.core.dtypes.cast import _int64_max, maybe_upcast_putmask
 from pandas.core.dtypes.missing import isna, notna, na_value_for_dtype
-from pandas.core.config import get_option
-import pandas.core.common as com
 
 _BOTTLENECK_INSTALLED = False
 _MIN_BOTTLENECK_VERSION = '1.0.0'
@@ -200,16 +201,18 @@ def _get_fill_value(dtype, fill_value=None, fill_value_typ=None):
 
 
 def _get_values(values, skipna, fill_value=None, fill_value_typ=None,
-                isfinite=False, copy=True):
+                isfinite=False, copy=True, mask=None):
     """ utility to get the values view, mask, dtype
     if necessary copy and mask using the specified fill_value
     copy = True will force the copy
     """
     values = com.values_from_object(values)
-    if isfinite:
-        mask = _isfinite(values)
-    else:
-        mask = isna(values)
+
+    if mask is None:
+        if isfinite:
+            mask = _isfinite(values)
+        else:
+            mask = isna(values)
 
     dtype = values.dtype
     dtype_ok = _na_ok_dtype(dtype)
@@ -315,19 +318,98 @@ def _na_for_min_count(values, axis):
         return result
 
 
-def nanany(values, axis=None, skipna=True):
-    values, mask, dtype, _ = _get_values(values, skipna, False, copy=skipna)
+def nanany(values, axis=None, skipna=True, mask=None):
+    """
+    Check if any elements along an axis evaluate to True.
+
+    Parameters
+    ----------
+    values : ndarray
+    axis : int, optional
+    skipna : bool, default True
+    mask : ndarray[bool], optional
+        nan-mask if known
+
+    Returns
+    -------
+    result : bool
+
+    Examples
+    --------
+    >>> import pandas.core.nanops as nanops
+    >>> s = pd.Series([1, 2])
+    >>> nanops.nanany(s)
+    True
+
+    >>> import pandas.core.nanops as nanops
+    >>> s = pd.Series([np.nan])
+    >>> nanops.nanany(s)
+    False
+    """
+    values, mask, dtype, _ = _get_values(values, skipna, False, copy=skipna,
+                                         mask=mask)
     return values.any(axis)
 
 
-def nanall(values, axis=None, skipna=True):
-    values, mask, dtype, _ = _get_values(values, skipna, True, copy=skipna)
+def nanall(values, axis=None, skipna=True, mask=None):
+    """
+    Check if all elements along an axis evaluate to True.
+
+    Parameters
+    ----------
+    values : ndarray
+    axis : int, optional
+    skipna : bool, default True
+    mask : ndarray[bool], optional
+        nan-mask if known
+
+    Returns
+    -------
+    result : bool
+
+    Examples
+    --------
+    >>> import pandas.core.nanops as nanops
+    >>> s = pd.Series([1, 2, np.nan])
+    >>> nanops.nanall(s)
+    True
+
+    >>> import pandas.core.nanops as nanops
+    >>> s = pd.Series([1, 0])
+    >>> nanops.nanall(s)
+    False
+    """
+    values, mask, dtype, _ = _get_values(values, skipna, True, copy=skipna,
+                                         mask=mask)
     return values.all(axis)
 
 
 @disallow('M8')
-def nansum(values, axis=None, skipna=True, min_count=0):
-    values, mask, dtype, dtype_max = _get_values(values, skipna, 0)
+def nansum(values, axis=None, skipna=True, min_count=0, mask=None):
+    """
+    Sum the elements along an axis ignoring NaNs
+
+    Parameters
+    ----------
+    values : ndarray[dtype]
+    axis : int, optional
+    skipna : bool, default True
+    min_count : int, default 0
+    mask : ndarray[bool], optional
+        nan-mask if known
+
+    Returns
+    -------
+    result : dtype
+
+    Examples
+    --------
+    >>> import pandas.core.nanops as nanops
+    >>> s = pd.Series([1, 2, np.nan])
+    >>> nanops.nansum(s)
+    3.0
+    """
+    values, mask, dtype, dtype_max = _get_values(values, skipna, 0, mask=mask)
     dtype_sum = dtype_max
     if is_float_dtype(dtype):
         dtype_sum = dtype
@@ -341,9 +423,32 @@ def nansum(values, axis=None, skipna=True, min_count=0):
 
 @disallow('M8')
 @bottleneck_switch()
-def nanmean(values, axis=None, skipna=True):
-    values, mask, dtype, dtype_max = _get_values(values, skipna, 0)
+def nanmean(values, axis=None, skipna=True, mask=None):
+    """
+    Compute the mean of the elements along an axis, ignoring NaNs
 
+    Parameters
+    ----------
+    values : ndarray
+    axis : int, optional
+    skipna : bool, default True
+    mask : ndarray[bool], optional
+        nan-mask if known
+
+    Returns
+    -------
+    result : float
+        Unless input is a float array, in which case use the same
+        precision as the input array.
+
+    Examples
+    --------
+    >>> import pandas.core.nanops as nanops
+    >>> s = pd.Series([1, 2, np.nan])
+    >>> nanops.nanmean(s)
+    1.5
+    """
+    values, mask, dtype, dtype_max = _get_values(values, skipna, 0, mask=mask)
     dtype_sum = dtype_max
     dtype_count = np.float64
     if is_integer_dtype(dtype) or is_timedelta64_dtype(dtype):
@@ -367,15 +472,36 @@ def nanmean(values, axis=None, skipna=True):
 
 @disallow('M8')
 @bottleneck_switch()
-def nanmedian(values, axis=None, skipna=True):
+def nanmedian(values, axis=None, skipna=True, mask=None):
+    """
+    Parameters
+    ----------
+    values : ndarray
+    axis : int, optional
+    skipna : bool, default True
+    mask : ndarray[bool], optional
+        nan-mask if known
 
+    Returns
+    -------
+    result : float
+        Unless input is a float array, in which case use the same
+        precision as the input array.
+
+    Examples
+    --------
+    >>> import pandas.core.nanops as nanops
+    >>> s = pd.Series([1, np.nan, 2, 2])
+    >>> nanops.nanmedian(s)
+    2.0
+    """
     def get_median(x):
         mask = notna(x)
         if not skipna and not mask.all():
             return np.nan
         return np.nanmedian(x[mask])
 
-    values, mask, dtype, dtype_max = _get_values(values, skipna)
+    values, mask, dtype, dtype_max = _get_values(values, skipna, mask=mask)
     if not is_float_dtype(values):
         values = values.astype('f8')
         values[mask] = np.nan
@@ -431,18 +557,73 @@ def _get_counts_nanvar(mask, axis, ddof, dtype=float):
 
 @disallow('M8')
 @bottleneck_switch(ddof=1)
-def nanstd(values, axis=None, skipna=True, ddof=1):
-    result = np.sqrt(nanvar(values, axis=axis, skipna=skipna, ddof=ddof))
+def nanstd(values, axis=None, skipna=True, ddof=1, mask=None):
+    """
+    Compute the standard deviation along given axis while ignoring NaNs
+
+    Parameters
+    ----------
+    values : ndarray
+    axis : int, optional
+    skipna : bool, default True
+    ddof : int, default 1
+        Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
+        where N represents the number of elements.
+    mask : ndarray[bool], optional
+        nan-mask if known
+
+    Returns
+    -------
+    result : float
+        Unless input is a float array, in which case use the same
+        precision as the input array.
+
+    Examples
+    --------
+    >>> import pandas.core.nanops as nanops
+    >>> s = pd.Series([1, np.nan, 2, 3])
+    >>> nanops.nanstd(s)
+    1.0
+    """
+    result = np.sqrt(nanvar(values, axis=axis, skipna=skipna, ddof=ddof,
+                            mask=mask))
     return _wrap_results(result, values.dtype)
 
 
 @disallow('M8')
 @bottleneck_switch(ddof=1)
-def nanvar(values, axis=None, skipna=True, ddof=1):
+def nanvar(values, axis=None, skipna=True, ddof=1, mask=None):
+    """
+    Compute the variance along given axis while ignoring NaNs
+
+    Parameters
+    ----------
+    values : ndarray
+    axis : int, optional
+    skipna : bool, default True
+    ddof : int, default 1
+        Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
+        where N represents the number of elements.
+    mask : ndarray[bool], optional
+        nan-mask if known
 
+    Returns
+    -------
+    result : float
+        Unless input is a float array, in which case use the same
+        precision as the input array.
+
+    Examples
+    --------
+    >>> import pandas.core.nanops as nanops
+    >>> s = pd.Series([1, np.nan, 2, 3])
+    >>> nanops.nanvar(s)
+    1.0
+    """
     values = com.values_from_object(values)
     dtype = values.dtype
-    mask = isna(values)
+    if mask is None:
+        mask = isna(values)
     if is_any_int_dtype(values):
         values = values.astype('f8')
         values[mask] = np.nan
@@ -465,7 +646,7 @@ def nanvar(values, axis=None, skipna=True, ddof=1):
     avg = _ensure_numeric(values.sum(axis=axis, dtype=np.float64)) / count
     if axis is not None:
         avg = np.expand_dims(avg, axis)
-    sqr = _ensure_numeric((avg - values)**2)
+    sqr = _ensure_numeric((avg - values) ** 2)
     np.putmask(sqr, mask, 0)
     result = sqr.sum(axis=axis, dtype=np.float64) / d
 
@@ -478,12 +659,41 @@ def nanvar(values, axis=None, skipna=True, ddof=1):
 
 
 @disallow('M8', 'm8')
-def nansem(values, axis=None, skipna=True, ddof=1):
+def nansem(values, axis=None, skipna=True, ddof=1, mask=None):
+    """
+    Compute the standard error in the mean along given axis while ignoring NaNs
+
+    Parameters
+    ----------
+    values : ndarray
+    axis : int, optional
+    skipna : bool, default True
+    ddof : int, default 1
+        Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
+        where N represents the number of elements.
+    mask : ndarray[bool], optional
+        nan-mask if known
+
+    Returns
+    -------
+    result : float64
+        Unless input is a float array, in which case use the same
+        precision as the input array.
+
+    Examples
+    --------
+    >>> import pandas.core.nanops as nanops
+    >>> s = pd.Series([1, np.nan, 2, 3])
+    >>> nanops.nansem(s)
+    0.5773502691896258
+    """
+
     # This checks if non-numeric-like data is passed with numeric_only=False
     # and raises a TypeError otherwise
-    nanvar(values, axis, skipna, ddof=ddof)
+    nanvar(values, axis, skipna, ddof=ddof, mask=mask)
 
-    mask = isna(values)
+    if mask is None:
+        mask = isna(values)
     if not is_float_dtype(values.dtype):
         values = values.astype('f8')
     count, _ = _get_counts_nanvar(mask, axis, ddof, values.dtype)
@@ -494,9 +704,9 @@ def nansem(values, axis=None, skipna=True, ddof=1):
 
 def _nanminmax(meth, fill_value_typ):
     @bottleneck_switch()
-    def reduction(values, axis=None, skipna=True):
+    def reduction(values, axis=None, skipna=True, mask=None):
         values, mask, dtype, dtype_max = _get_values(
-            values, skipna, fill_value_typ=fill_value_typ, )
+            values, skipna, fill_value_typ=fill_value_typ, mask=mask)
 
         if ((axis is not None and values.shape[axis] == 0) or
                 values.size == 0):
@@ -521,39 +731,97 @@ def reduction(values, axis=None, skipna=True):
 
 
 @disallow('O')
-def nanargmax(values, axis=None, skipna=True):
+def nanargmax(values, axis=None, skipna=True, mask=None):
     """
-    Returns -1 in the NA case
+    Parameters
+    ----------
+    values : ndarray
+    axis : int, optional
+    skipna : bool, default True
+    mask : ndarray[bool], optional
+        nan-mask if known
+
+    Returns
+    -------
+    result : int
+        The index of max value in specified axis or -1 in the NA case
+
+    Examples
+    --------
+    >>> import pandas.core.nanops as nanops
+    >>> s = pd.Series([1, 2, 3, np.nan, 4])
+    >>> nanops.nanargmax(s)
+    4
     """
-    values, mask, dtype, _ = _get_values(values, skipna, fill_value_typ='-inf')
+    values, mask, dtype, _ = _get_values(values, skipna, fill_value_typ='-inf',
+                                         mask=mask)
     result = values.argmax(axis)
     result = _maybe_arg_null_out(result, axis, mask, skipna)
     return result
 
 
 @disallow('O')
-def nanargmin(values, axis=None, skipna=True):
+def nanargmin(values, axis=None, skipna=True, mask=None):
     """
-    Returns -1 in the NA case
+    Parameters
+    ----------
+    values : ndarray
+    axis : int, optional
+    skipna : bool, default True
+    mask : ndarray[bool], optional
+        nan-mask if known
+
+    Returns
+    -------
+    result : int
+        The index of min value in specified axis or -1 in the NA case
+
+    Examples
+    --------
+    >>> import pandas.core.nanops as nanops
+    >>> s = pd.Series([1, 2, 3, np.nan, 4])
+    >>> nanops.nanargmin(s)
+    0
     """
-    values, mask, dtype, _ = _get_values(values, skipna, fill_value_typ='+inf')
+    values, mask, dtype, _ = _get_values(values, skipna, fill_value_typ='+inf',
+                                         mask=mask)
     result = values.argmin(axis)
     result = _maybe_arg_null_out(result, axis, mask, skipna)
     return result
 
 
 @disallow('M8', 'm8')
-def nanskew(values, axis=None, skipna=True):
+def nanskew(values, axis=None, skipna=True, mask=None):
     """ Compute the sample skewness.
 
     The statistic computed here is the adjusted Fisher-Pearson standardized
     moment coefficient G1. The algorithm computes this coefficient directly
     from the second and third central moment.
 
-    """
+    Parameters
+    ----------
+    values : ndarray
+    axis : int, optional
+    skipna : bool, default True
+    mask : ndarray[bool], optional
+        nan-mask if known
 
+    Returns
+    -------
+    result : float64
+        Unless input is a float array, in which case use the same
+        precision as the input array.
+
+    Examples
+    --------
+    >>> import pandas.core.nanops as nanops
+    >>> s = pd.Series([1,np.nan, 1, 2])
+    >>> nanops.nanskew(s)
+    1.7320508075688787
+    """
     values = com.values_from_object(values)
-    mask = isna(values)
+    if mask is None:
+        mask = isna(values)
     if not is_float_dtype(values.dtype):
         values = values.astype('f8')
         count = _get_counts(mask, axis)
@@ -602,16 +870,38 @@ def nanskew(values, axis=None, skipna=True):
 
 
 @disallow('M8', 'm8')
-def nankurt(values, axis=None, skipna=True):
-    """ Compute the sample excess kurtosis.
+def nankurt(values, axis=None, skipna=True, mask=None):
+    """
+    Compute the sample excess kurtosis
 
     The statistic computed here is the adjusted Fisher-Pearson standardized
     moment coefficient G2, computed directly from the second and fourth
     central moment.
 
+    Parameters
+    ----------
+    values : ndarray
+    axis : int, optional
+    skipna : bool, default True
+    mask : ndarray[bool], optional
+        nan-mask if known
+
+    Returns
+    -------
+    result : float64
+        Unless input is a float array, in which case use the same
+        precision as the input array.
+
+    Examples
+    --------
+    >>> import pandas.core.nanops as nanops
+    >>> s = pd.Series([1,np.nan, 1, 3, 2])
+    >>> nanops.nankurt(s)
+    -1.2892561983471076
     """
     values = com.values_from_object(values)
-    mask = isna(values)
+    if mask is None:
+        mask = isna(values)
     if not is_float_dtype(values.dtype):
         values = values.astype('f8')
         count = _get_counts(mask, axis)
@@ -637,7 +927,7 @@ def nankurt(values, axis=None, skipna=True):
     with np.errstate(invalid='ignore', divide='ignore'):
         adj = 3 * (count - 1) ** 2 / ((count - 2) * (count - 3))
         numer = count * (count + 1) * (count - 1) * m4
-        denom = (count - 2) * (count - 3) * m2**2
+        denom = (count - 2) * (count - 3) * m2 ** 2
 
     # floating point error
     #
@@ -669,8 +959,34 @@ def nankurt(values, axis=None, skipna=True):
 
 
 @disallow('M8', 'm8')
-def nanprod(values, axis=None, skipna=True, min_count=0):
-    mask = isna(values)
+def nanprod(values, axis=None, skipna=True, min_count=0, mask=None):
+    """
+    Parameters
+    ----------
+    values : ndarray[dtype]
+    axis : int, optional
+    skipna : bool, default True
+    min_count : int, default 0
+    mask : ndarray[bool], optional
+        nan-mask if known
+
+    Returns
+    -------
+    result : dtype
+
+    Examples
+    --------
+    >>> import pandas.core.nanops as nanops
+    >>> s = pd.Series([1, 2, 3, np.nan])
+    >>> nanops.nanprod(s)
+    6.0
+
+    Notes
+    -----
+    Computes the product along the given axis; skipped NaNs are treated as 1.
+    """
+    if mask is None:
+        mask = isna(values)
     if skipna and not is_any_int_dtype(values):
         values = values.copy()
         values[mask] = 1
diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py
index b6c2c65fb6dce..b06463d3c07aa 100644
--- a/pandas/tests/test_nanops.py
+++ b/pandas/tests/test_nanops.py
@@ -1,19 +1,19 @@
 # -*- coding: utf-8 -*-
 from __future__ import division, print_function
 
+import warnings
 from functools import partial
 
-import pytest
-import warnings
 import numpy as np
+import pytest
 
 import pandas as pd
-from pandas import Series, isna
-from pandas.core.dtypes.common import is_integer_dtype
 import pandas.core.nanops as nanops
-import pandas.util.testing as tm
 import pandas.util._test_decorators as td
+import pandas.util.testing as tm
+from pandas import Series, isna
 from pandas.compat.numpy import _np_version_under1p13
+from pandas.core.dtypes.common import is_integer_dtype
 
 use_bn = nanops._USE_BOTTLENECK
 
@@ -1041,3 +1041,29 @@ def test_numpy_ops_np_version_under1p13(numpy_op, expected):
             assert result == expected
     else:
         assert result == expected
+
+
+@pytest.mark.parametrize("operation", [
+    nanops.nanany,
+    nanops.nanall,
+    nanops.nansum,
+    nanops.nanmean,
+    nanops.nanmedian,
+    nanops.nanstd,
+    nanops.nanvar,
+    nanops.nansem,
+    nanops.nanargmax,
+    nanops.nanargmin,
+    nanops.nanmax,
+    nanops.nanmin,
+    nanops.nanskew,
+    nanops.nankurt,
+    nanops.nanprod,
+])
+def test_nanops_independent_of_mask_param(operation):
+    # GH22764
+    s = pd.Series([1, 2, np.nan, 3, np.nan, 4])
+    mask = s.isna()
+    median_expected = operation(s)
+    median_result = operation(s, mask=mask)
+    assert median_expected == median_result