From 3e3256bb6038111812b4b28f6b3b049214d83d2d Mon Sep 17 00:00:00 2001 From: alimcmaster1 Date: Wed, 3 Oct 2018 12:23:22 +0100 Subject: [PATCH] Allow passing a mask to NanOps (#22865) --- pandas/core/nanops.py | 404 ++++++++++++++++++++++++++++++++---- pandas/tests/test_nanops.py | 36 +++- 2 files changed, 391 insertions(+), 49 deletions(-) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 232d030da7f1e..2884bc1a19491 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -1,12 +1,16 @@ -import itertools import functools +import itertools import operator import warnings from distutils.version import LooseVersion import numpy as np + +import pandas.core.common as com from pandas import compat from pandas._libs import tslibs, lib +from pandas.core.config import get_option +from pandas.core.dtypes.cast import _int64_max, maybe_upcast_putmask from pandas.core.dtypes.common import ( _get_dtype, is_float, is_scalar, @@ -17,10 +21,7 @@ is_datetime64_dtype, is_timedelta64_dtype, is_datetime_or_timedelta_dtype, is_int_or_datetime_dtype, is_any_int_dtype) -from pandas.core.dtypes.cast import _int64_max, maybe_upcast_putmask from pandas.core.dtypes.missing import isna, notna, na_value_for_dtype -from pandas.core.config import get_option -import pandas.core.common as com _BOTTLENECK_INSTALLED = False _MIN_BOTTLENECK_VERSION = '1.0.0' @@ -200,16 +201,18 @@ def _get_fill_value(dtype, fill_value=None, fill_value_typ=None): def _get_values(values, skipna, fill_value=None, fill_value_typ=None, - isfinite=False, copy=True): + isfinite=False, copy=True, mask=None): """ utility to get the values view, mask, dtype if necessary copy and mask using the specified fill_value copy = True will force the copy """ values = com.values_from_object(values) - if isfinite: - mask = _isfinite(values) - else: - mask = isna(values) + + if mask is None: + if isfinite: + mask = _isfinite(values) + else: + mask = isna(values) dtype = values.dtype dtype_ok = _na_ok_dtype(dtype) @@ -315,19 +318,98 @@ def _na_for_min_count(values, axis): return result -def nanany(values, axis=None, skipna=True): - values, mask, dtype, _ = _get_values(values, skipna, False, copy=skipna) +def nanany(values, axis=None, skipna=True, mask=None): + """ + Check if any elements along an axis evaluate to True. + + Parameters + ---------- + values : ndarray + axis : int, optional + skipna : bool, default True + mask : ndarray[bool], optional + nan-mask if known + + Returns + ------- + result : bool + + Examples + -------- + >>> import pandas.core.nanops as nanops + >>> s = pd.Series([1, 2]) + >>> nanops.nanany(s) + True + + >>> import pandas.core.nanops as nanops + >>> s = pd.Series([np.nan]) + >>> nanops.nanany(s) + False + """ + values, mask, dtype, _ = _get_values(values, skipna, False, copy=skipna, + mask=mask) return values.any(axis) -def nanall(values, axis=None, skipna=True): - values, mask, dtype, _ = _get_values(values, skipna, True, copy=skipna) +def nanall(values, axis=None, skipna=True, mask=None): + """ + Check if all elements along an axis evaluate to True. + + Parameters + ---------- + values : ndarray + axis: int, optional + skipna : bool, default True + mask : ndarray[bool], optional + nan-mask if known + + Returns + ------- + result : bool + + Examples + -------- + >>> import pandas.core.nanops as nanops + >>> s = pd.Series([1, 2, np.nan]) + >>> nanops.nanall(s) + True + + >>> import pandas.core.nanops as nanops + >>> s = pd.Series([1, 0]) + >>> nanops.nanall(s) + False + """ + values, mask, dtype, _ = _get_values(values, skipna, True, copy=skipna, + mask=mask) return values.all(axis) @disallow('M8') -def nansum(values, axis=None, skipna=True, min_count=0): - values, mask, dtype, dtype_max = _get_values(values, skipna, 0) +def nansum(values, axis=None, skipna=True, min_count=0, mask=None): + """ + Sum the elements along an axis ignoring NaNs + + Parameters + ---------- + values : ndarray[dtype] + axis: int, optional + skipna : bool, default True + min_count: int, default 0 + mask : ndarray[bool], optional + nan-mask if known + + Returns + ------- + result : dtype + + Examples + -------- + >>> import pandas.core.nanops as nanops + >>> s = pd.Series([1, 2, np.nan]) + >>> nanops.nansum(s) + 3.0 + """ + values, mask, dtype, dtype_max = _get_values(values, skipna, 0, mask=mask) dtype_sum = dtype_max if is_float_dtype(dtype): dtype_sum = dtype @@ -341,9 +423,32 @@ def nansum(values, axis=None, skipna=True, min_count=0): @disallow('M8') @bottleneck_switch() -def nanmean(values, axis=None, skipna=True): - values, mask, dtype, dtype_max = _get_values(values, skipna, 0) +def nanmean(values, axis=None, skipna=True, mask=None): + """ + Compute the mean of the element along an axis ignoring NaNs + Parameters + ---------- + values : ndarray + axis: int, optional + skipna : bool, default True + mask : ndarray[bool], optional + nan-mask if known + + Returns + ------- + result : float + Unless input is a float array, in which case use the same + precision as the input array. + + Examples + -------- + >>> import pandas.core.nanops as nanops + >>> s = pd.Series([1, 2, np.nan]) + >>> nanops.nanmean(s) + 1.5 + """ + values, mask, dtype, dtype_max = _get_values(values, skipna, 0, mask=mask) dtype_sum = dtype_max dtype_count = np.float64 if is_integer_dtype(dtype) or is_timedelta64_dtype(dtype): @@ -367,15 +472,36 @@ def nanmean(values, axis=None, skipna=True): @disallow('M8') @bottleneck_switch() -def nanmedian(values, axis=None, skipna=True): +def nanmedian(values, axis=None, skipna=True, mask=None): + """ + Parameters + ---------- + values : ndarray + axis: int, optional + skipna : bool, default True + mask : ndarray[bool], optional + nan-mask if known + Returns + ------- + result : float + Unless input is a float array, in which case use the same + precision as the input array. + + Examples + -------- + >>> import pandas.core.nanops as nanops + >>> s = pd.Series([1, np.nan, 2, 2]) + >>> nanops.nanmedian(s) + 2.0 + """ def get_median(x): mask = notna(x) if not skipna and not mask.all(): return np.nan return np.nanmedian(x[mask]) - values, mask, dtype, dtype_max = _get_values(values, skipna) + values, mask, dtype, dtype_max = _get_values(values, skipna, mask=mask) if not is_float_dtype(values): values = values.astype('f8') values[mask] = np.nan @@ -431,18 +557,73 @@ def _get_counts_nanvar(mask, axis, ddof, dtype=float): @disallow('M8') @bottleneck_switch(ddof=1) -def nanstd(values, axis=None, skipna=True, ddof=1): - result = np.sqrt(nanvar(values, axis=axis, skipna=skipna, ddof=ddof)) +def nanstd(values, axis=None, skipna=True, ddof=1, mask=None): + """ + Compute the standard deviation along given axis while ignoring NaNs + + Parameters + ---------- + values : ndarray + axis: int, optional + skipna : bool, default True + ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations is N - ddof, + where N represents the number of elements. + mask : ndarray[bool], optional + nan-mask if known + + Returns + ------- + result : float + Unless input is a float array, in which case use the same + precision as the input array. + + Examples + -------- + >>> import pandas.core.nanops as nanops + >>> s = pd.Series([1, np.nan, 2, 3]) + >>> nanops.nanstd(s) + 1.0 + """ + result = np.sqrt(nanvar(values, axis=axis, skipna=skipna, ddof=ddof, + mask=mask)) return _wrap_results(result, values.dtype) @disallow('M8') @bottleneck_switch(ddof=1) -def nanvar(values, axis=None, skipna=True, ddof=1): +def nanvar(values, axis=None, skipna=True, ddof=1, mask=None): + """ + Compute the variance along given axis while ignoring NaNs + + Parameters + ---------- + values : ndarray + axis: int, optional + skipna : bool, default True + ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations is N - ddof, + where N represents the number of elements. + mask : ndarray[bool], optional + nan-mask if known + Returns + ------- + result : float + Unless input is a float array, in which case use the same + precision as the input array. + + Examples + -------- + >>> import pandas.core.nanops as nanops + >>> s = pd.Series([1, np.nan, 2, 3]) + >>> nanops.nanvar(s) + 1.0 + """ values = com.values_from_object(values) dtype = values.dtype - mask = isna(values) + if mask is None: + mask = isna(values) if is_any_int_dtype(values): values = values.astype('f8') values[mask] = np.nan @@ -465,7 +646,7 @@ def nanvar(values, axis=None, skipna=True, ddof=1): avg = _ensure_numeric(values.sum(axis=axis, dtype=np.float64)) / count if axis is not None: avg = np.expand_dims(avg, axis) - sqr = _ensure_numeric((avg - values)**2) + sqr = _ensure_numeric((avg - values) ** 2) np.putmask(sqr, mask, 0) result = sqr.sum(axis=axis, dtype=np.float64) / d @@ -478,12 +659,41 @@ def nanvar(values, axis=None, skipna=True, ddof=1): @disallow('M8', 'm8') -def nansem(values, axis=None, skipna=True, ddof=1): +def nansem(values, axis=None, skipna=True, ddof=1, mask=None): + """ + Compute the standard error in the mean along given axis while ignoring NaNs + + Parameters + ---------- + values : ndarray + axis: int, optional + skipna : bool, default True + ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations is N - ddof, + where N represents the number of elements. + mask : ndarray[bool], optional + nan-mask if known + + Returns + ------- + result : float64 + Unless input is a float array, in which case use the same + precision as the input array. + + Examples + -------- + >>> import pandas.core.nanops as nanops + >>> s = pd.Series([1, np.nan, 2, 3]) + >>> nanops.nansem(s) + 0.5773502691896258 + """ + # This checks if non-numeric-like data is passed with numeric_only=False # and raises a TypeError otherwise - nanvar(values, axis, skipna, ddof=ddof) + nanvar(values, axis, skipna, ddof=ddof, mask=mask) - mask = isna(values) + if mask is None: + mask = isna(values) if not is_float_dtype(values.dtype): values = values.astype('f8') count, _ = _get_counts_nanvar(mask, axis, ddof, values.dtype) @@ -494,9 +704,9 @@ def nansem(values, axis=None, skipna=True, ddof=1): def _nanminmax(meth, fill_value_typ): @bottleneck_switch() - def reduction(values, axis=None, skipna=True): + def reduction(values, axis=None, skipna=True, mask=None): values, mask, dtype, dtype_max = _get_values( - values, skipna, fill_value_typ=fill_value_typ, ) + values, skipna, fill_value_typ=fill_value_typ, mask=mask) if ((axis is not None and values.shape[axis] == 0) or values.size == 0): @@ -521,39 +731,97 @@ def reduction(values, axis=None, skipna=True): @disallow('O') -def nanargmax(values, axis=None, skipna=True): +def nanargmax(values, axis=None, skipna=True, mask=None): """ - Returns -1 in the NA case + Parameters + ---------- + values : ndarray + axis: int, optional + skipna : bool, default True + mask : ndarray[bool], optional + nan-mask if known + + Returns + -------- + result : int + The index of max value in specified axis or -1 in the NA case + + Examples + -------- + >>> import pandas.core.nanops as nanops + >>> s = pd.Series([1, 2, 3, np.nan, 4]) + >>> nanops.nanargmax(s) + 4 """ - values, mask, dtype, _ = _get_values(values, skipna, fill_value_typ='-inf') + values, mask, dtype, _ = _get_values(values, skipna, fill_value_typ='-inf', + mask=mask) result = values.argmax(axis) result = _maybe_arg_null_out(result, axis, mask, skipna) return result @disallow('O') -def nanargmin(values, axis=None, skipna=True): +def nanargmin(values, axis=None, skipna=True, mask=None): """ - Returns -1 in the NA case + Parameters + ---------- + values : ndarray + axis: int, optional + skipna : bool, default True + mask : ndarray[bool], optional + nan-mask if known + + Returns + -------- + result : int + The index of min value in specified axis or -1 in the NA case + + Examples + -------- + >>> import pandas.core.nanops as nanops + >>> s = pd.Series([1, 2, 3, np.nan, 4]) + >>> nanops.nanargmin(s) + 0 """ - values, mask, dtype, _ = _get_values(values, skipna, fill_value_typ='+inf') + values, mask, dtype, _ = _get_values(values, skipna, fill_value_typ='+inf', + mask=mask) result = values.argmin(axis) result = _maybe_arg_null_out(result, axis, mask, skipna) return result @disallow('M8', 'm8') -def nanskew(values, axis=None, skipna=True): +def nanskew(values, axis=None, skipna=True, mask=None): """ Compute the sample skewness. The statistic computed here is the adjusted Fisher-Pearson standardized moment coefficient G1. The algorithm computes this coefficient directly from the second and third central moment. - """ + Parameters + ---------- + values : ndarray + axis: int, optional + skipna : bool, default True + mask : ndarray[bool], optional + nan-mask if known + Returns + ------- + result : float64 + Unless input is a float array, in which case use the same + precision as the input array. + + Examples + -------- + >>> import pandas.core.nanops as nanops + >>> s = pd.Series([1,np.nan, 1, 2]) + >>> nanops.nanskew(s) + 1.7320508075688787 + """ values = com.values_from_object(values) - mask = isna(values) + if mask is None: + mask = isna(values) if not is_float_dtype(values.dtype): values = values.astype('f8') count = _get_counts(mask, axis) @@ -602,16 +870,38 @@ def nanskew(values, axis=None, skipna=True): @disallow('M8', 'm8') -def nankurt(values, axis=None, skipna=True): - """ Compute the sample excess kurtosis. +def nankurt(values, axis=None, skipna=True, mask=None): + """ + Compute the sample excess kurtosis The statistic computed here is the adjusted Fisher-Pearson standardized moment coefficient G2, computed directly from the second and fourth central moment. + Parameters + ---------- + values : ndarray + axis: int, optional + skipna : bool, default True + mask : ndarray[bool], optional + nan-mask if known + + Returns + ------- + result : float64 + Unless input is a float array, in which case use the same + precision as the input array. + + Examples + -------- + >>> import pandas.core.nanops as nanops + >>> s = pd.Series([1,np.nan, 1, 3, 2]) + >>> nanops.nankurt(s) + -1.2892561983471076 """ values = com.values_from_object(values) - mask = isna(values) + if mask is None: + mask = isna(values) if not is_float_dtype(values.dtype): values = values.astype('f8') count = _get_counts(mask, axis) @@ -637,7 +927,7 @@ def nankurt(values, axis=None, skipna=True): with np.errstate(invalid='ignore', divide='ignore'): adj = 3 * (count - 1) ** 2 / ((count - 2) * (count - 3)) numer = count * (count + 1) * (count - 1) * m4 - denom = (count - 2) * (count - 3) * m2**2 + denom = (count - 2) * (count - 3) * m2 ** 2 # floating point error # @@ -669,8 +959,34 @@ def nankurt(values, axis=None, skipna=True): @disallow('M8', 'm8') -def nanprod(values, axis=None, skipna=True, min_count=0): - mask = isna(values) +def nanprod(values, axis=None, skipna=True, min_count=0, mask=None): + """ + Parameters + ---------- + values : ndarray[dtype] + axis: int, optional + skipna : bool, default True + min_count: int, default 0 + mask : ndarray[bool], optional + nan-mask if known + + Returns + ------- + result : dtype + + Examples + -------- + >>> import pandas.core.nanops as nanops + >>> s = pd.Series([1, 2, 3, np.nan]) + >>> nanops.nanprod(s) + 6.0 + + Returns + -------- + The product of all elements on a given axis. ( NaNs are treated as 1) + """ + if mask is None: + mask = isna(values) if skipna and not is_any_int_dtype(values): values = values.copy() values[mask] = 1 diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index b6c2c65fb6dce..b06463d3c07aa 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -1,19 +1,19 @@ # -*- coding: utf-8 -*- from __future__ import division, print_function +import warnings from functools import partial -import pytest -import warnings import numpy as np +import pytest import pandas as pd -from pandas import Series, isna -from pandas.core.dtypes.common import is_integer_dtype import pandas.core.nanops as nanops -import pandas.util.testing as tm import pandas.util._test_decorators as td +import pandas.util.testing as tm +from pandas import Series, isna from pandas.compat.numpy import _np_version_under1p13 +from pandas.core.dtypes.common import is_integer_dtype use_bn = nanops._USE_BOTTLENECK @@ -1041,3 +1041,29 @@ def test_numpy_ops_np_version_under1p13(numpy_op, expected): assert result == expected else: assert result == expected + + +@pytest.mark.parametrize("operation", [ + nanops.nanany, + nanops.nanall, + nanops.nansum, + nanops.nanmean, + nanops.nanmedian, + nanops.nanstd, + nanops.nanvar, + nanops.nansem, + nanops.nanargmax, + nanops.nanargmin, + nanops.nanmax, + nanops.nanmin, + nanops.nanskew, + nanops.nankurt, + nanops.nanprod, +]) +def test_nanops_independent_of_mask_param(operation): + # GH22764 + s = pd.Series([1, 2, np.nan, 3, np.nan, 4]) + mask = s.isna() + median_expected = operation(s) + median_result = operation(s, mask=mask) + assert median_expected == median_result