diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 2788ac6a600..775bc365cee 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -69,6 +69,7 @@ ListDtype, StructDtype, ) +from cudf.core.mixins import Reducible from cudf.utils import utils from cudf.utils.dtypes import ( cudf_dtype_from_pa_type, @@ -86,7 +87,14 @@ Slice = TypeVar("Slice", bound=slice) -class ColumnBase(Column, Serializable, NotIterable): +class ColumnBase(Column, Serializable, Reducible, NotIterable): + _VALID_REDUCTIONS = { + "any", + "all", + "max", + "min", + } + def as_frame(self) -> "cudf.core.frame.Frame": """ Converts a Column to Frame @@ -674,16 +682,10 @@ def append(self, other: ColumnBase) -> ColumnBase: return concat_columns([self, as_column(other)]) def quantile( - self, - q: Union[float, Sequence[float]], - interpolation: builtins.str, - exact: bool, + self, q: Union[float, Sequence[float]], interpolation: str, exact: bool ) -> ColumnBase: raise TypeError(f"cannot perform quantile with type {self.dtype}") - def median(self, skipna: bool = None) -> ScalarLike: - raise TypeError(f"cannot perform median with type {self.dtype}") - def take( self: T, indices: ColumnBase, nullify: bool = False, check_bounds=True ) -> T: @@ -1162,53 +1164,23 @@ def _minmax(self, skipna: bool = None): return libcudf.reduce.minmax(result_col) return result_col - def min(self, skipna: bool = None, dtype: Dtype = None): - result_col = self._process_for_reduction(skipna=skipna) - if isinstance(result_col, ColumnBase): - return libcudf.reduce.reduce("min", result_col, dtype=dtype) - return result_col - - def max(self, skipna: bool = None, dtype: Dtype = None): - result_col = self._process_for_reduction(skipna=skipna) - if isinstance(result_col, ColumnBase): - return libcudf.reduce.reduce("max", result_col, dtype=dtype) - return result_col - - def sum( - self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0 - ): - raise TypeError(f"cannot perform sum with type {self.dtype}") - - def product( - self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0 - ): - raise TypeError(f"cannot perform product with type {self.dtype}") - - def mean(self, skipna: bool = None, dtype: Dtype = None): - raise TypeError(f"cannot perform mean with type {self.dtype}") - - def std(self, skipna: bool = None, ddof=1, dtype: Dtype = np.float64): - raise TypeError(f"cannot perform std with type {self.dtype}") - - def var(self, skipna: bool = None, ddof=1, dtype: Dtype = np.float64): - raise TypeError(f"cannot perform var with type {self.dtype}") - - def kurtosis(self, skipna: bool = None): - raise TypeError(f"cannot perform kurtosis with type {self.dtype}") + def _reduce( + self, op: str, skipna: bool = None, min_count: int = 0, *args, **kwargs + ) -> ScalarLike: + """Compute {op} of column values. - def skew(self, skipna: bool = None): - raise TypeError(f"cannot perform skew with type {self.dtype}") - - def cov(self, other: ColumnBase): - raise TypeError( - f"cannot perform covarience with types {self.dtype}, " - f"{other.dtype}" - ) - - def corr(self, other: ColumnBase): - raise TypeError( - f"cannot perform corr with types {self.dtype}, {other.dtype}" + skipna : bool + Whether or not na values must be skipped. + min_count : int, default 0 + The minimum number of entries for the reduction, otherwise the + reduction returns NaN. + """ + preprocessed = self._process_for_reduction( + skipna=skipna, min_count=min_count ) + if isinstance(preprocessed, ColumnBase): + return libcudf.reduce.reduce(op, preprocessed, **kwargs) + return preprocessed @property def contains_na_entries(self) -> bool: diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index c72fb66addc..44e5c6f62f1 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2021, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. from __future__ import annotations @@ -346,17 +346,27 @@ def as_string_column( column.column_empty(0, dtype="object", masked=False), ) - def mean(self, skipna=None, dtype=np.float64) -> ScalarLike: + def mean( + self, skipna=None, min_count: int = 0, dtype=np.float64 + ) -> ScalarLike: return pd.Timestamp( - self.as_numerical.mean(skipna=skipna, dtype=dtype), + self.as_numerical.mean( + skipna=skipna, min_count=min_count, dtype=dtype + ), unit=self.time_unit, ) def std( - self, skipna: bool = None, ddof: int = 1, dtype: Dtype = np.float64 + self, + skipna: bool = None, + min_count: int = 0, + dtype: Dtype = np.float64, + ddof: int = 1, ) -> pd.Timedelta: return pd.Timedelta( - self.as_numerical.std(skipna=skipna, ddof=ddof, dtype=dtype) + self.as_numerical.std( + skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof + ) * _numpy_to_pandas_conversion[self.time_unit], ) diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index 1f84cb88e37..db333328692 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. """Define an interface for columns that can perform numerical operations.""" from __future__ import annotations @@ -10,7 +10,7 @@ import cudf from cudf import _lib as libcudf -from cudf._typing import Dtype, ScalarLike +from cudf._typing import ScalarLike from cudf.core.column import ColumnBase @@ -23,59 +23,14 @@ class NumericalBaseColumn(ColumnBase): point, should be encoded here. """ - def reduce( - self, op: str, skipna: bool = None, min_count: int = 0, **kwargs - ) -> ScalarLike: - """Perform a reduction operation. - - op : str - The operation to perform. - skipna : bool - Whether or not na values must be - """ - preprocessed = self._process_for_reduction( - skipna=skipna, min_count=min_count - ) - if isinstance(preprocessed, ColumnBase): - return libcudf.reduce.reduce(op, preprocessed, **kwargs) - else: - return preprocessed - - def sum( - self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0 - ) -> ScalarLike: - return self.reduce( - "sum", skipna=skipna, dtype=dtype, min_count=min_count - ) - - def product( - self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0 - ) -> ScalarLike: - return self.reduce( - "product", skipna=skipna, dtype=dtype, min_count=min_count - ) - - def mean( - self, skipna: bool = None, dtype: Dtype = np.float64 - ) -> ScalarLike: - return self.reduce("mean", skipna=skipna, dtype=dtype) - - def var( - self, skipna: bool = None, ddof: int = 1, dtype: Dtype = np.float64 - ) -> ScalarLike: - return self.reduce("var", skipna=skipna, dtype=dtype, ddof=ddof) - - def std( - self, skipna: bool = None, ddof: int = 1, dtype: Dtype = np.float64 - ) -> ScalarLike: - return self.reduce("std", skipna=skipna, dtype=dtype, ddof=ddof) - - def sum_of_squares( - self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0 - ) -> ScalarLike: - return self.reduce( - "sum_of_squares", skipna=skipna, dtype=dtype, min_count=min_count - ) + _VALID_REDUCTIONS = { + "sum", + "product", + "sum_of_squares", + "mean", + "var", + "std", + } def _can_return_nan(self, skipna: bool = None) -> bool: return not skipna and self.has_nulls() @@ -148,6 +103,25 @@ def quantile( ) return result + def mean(self, skipna: bool = None, min_count: int = 0, dtype=np.float64): + return self._reduce( + "mean", skipna=skipna, min_count=min_count, dtype=dtype + ) + + def var( + self, skipna: bool = None, min_count: int = 0, dtype=np.float64, ddof=1 + ): + return self._reduce( + "var", skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof + ) + + def std( + self, skipna: bool = None, min_count: int = 0, dtype=np.float64, ddof=1 + ): + return self._reduce( + "std", skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof + ) + def median(self, skipna: bool = None) -> NumericalBaseColumn: skipna = True if skipna is None else skipna @@ -171,7 +145,7 @@ def _numeric_quantile( self, quant, interpolation, sorted_indices, exact ) - def cov(self, other: ColumnBase) -> float: + def cov(self, other: NumericalBaseColumn) -> float: if ( len(self) == 0 or len(other) == 0 @@ -183,7 +157,7 @@ def cov(self, other: ColumnBase) -> float: cov_sample = result.sum() / (len(self) - 1) return cov_sample - def corr(self, other: ColumnBase) -> float: + def corr(self, other: NumericalBaseColumn) -> float: if len(self) == 0 or len(other) == 0: return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 982ac098bbf..396e36a803a 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5154,7 +5154,7 @@ def to_arrow(self) -> pa.Array: return super().to_arrow() def sum( - self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0 + self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0, ): result_col = self._process_for_reduction( skipna=skipna, min_count=min_count diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 6c8c904e13c..7a5f777e88e 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from __future__ import annotations @@ -385,20 +385,29 @@ def quantile( return result.astype(self.dtype) def sum( - self, skipna: bool = None, dtype: Dtype = None, min_count=0 + self, skipna: bool = None, min_count: int = 0, dtype: Dtype = None, ) -> pd.Timedelta: return pd.Timedelta( - self.as_numerical.sum( - skipna=skipna, dtype=dtype, min_count=min_count + # Since sum isn't overriden in Numerical[Base]Column, mypy only + # sees the signature from Reducible (which doesn't have the extra + # parameters from ColumnBase._reduce) so we have to ignore this. + self.as_numerical.sum( # type: ignore + skipna=skipna, min_count=min_count, dtype=dtype ), unit=self.time_unit, ) def std( - self, skipna: bool = None, ddof: int = 1, dtype: Dtype = np.float64 + self, + skipna: bool = None, + min_count: int = 0, + dtype: Dtype = np.float64, + ddof: int = 1, ) -> pd.Timedelta: return pd.Timedelta( - self.as_numerical.std(skipna=skipna, ddof=ddof, dtype=dtype), + self.as_numerical.std( + skipna=skipna, min_count=min_count, ddof=ddof, dtype=dtype + ), unit=self.time_unit, ) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 6b5f3809c98..ecd3c2423d5 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5417,10 +5417,13 @@ def _reduce( axis = self._get_axis_from_axis_arg(axis) if axis == 0: - result = [ - getattr(self._data[col], op)(**kwargs) - for col in self._data.names - ] + try: + result = [ + getattr(self._data[col], op)(**kwargs) + for col in self._data.names + ] + except AttributeError: + raise TypeError(f"cannot perform {op} with type {self.dtype}") return Series._from_data( {None: result}, as_index(self._data.names) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 0ad6a9dc35a..b76f5dcc261 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -16,6 +16,7 @@ from cudf.api.types import is_list_like from cudf.core.abc import Serializable from cudf.core.column.column import arange, as_column +from cudf.core.mixins import Reducible from cudf.core.multiindex import MultiIndex from cudf.utils.utils import GetAttrGetItemMixin @@ -35,7 +36,23 @@ def _quantile_75(x): return x.quantile(0.75) -class GroupBy(Serializable): +class GroupBy(Serializable, Reducible): + + _VALID_REDUCTIONS = { + "sum", + "prod", + "idxmin", + "idxmax", + "min", + "max", + "mean", + "median", + "nunique", + "first", + "last", + "var", + "std", + } _MAX_GROUPS_BEFORE_WARN = 100 @@ -296,6 +313,46 @@ def agg(self, func): return result + def _reduce( + self, + op: str, + numeric_only: bool = False, + min_count: int = 0, + *args, + **kwargs, + ): + """Compute {op} of group values. + + Parameters + ---------- + numeric_only : bool, default None + Include only float, int, boolean columns. If None, will attempt to + use everything, then use only numeric data. + min_count : int, default 0 + The required number of valid values to perform the operation. If + fewer than ``min_count`` non-NA values are present the result will + be NA. + + Returns + ------- + Series or DataFrame + Computed {op} of values within each group. + + Notes + ----- + Difference from pandas: + * Not supporting: numeric_only, min_count + """ + if numeric_only: + raise NotImplementedError( + "numeric_only parameter is not implemented yet" + ) + if min_count != 0: + raise NotImplementedError( + "min_count parameter is not implemented yet" + ) + return self.agg(op) + aggregate = agg def nth(self, n): @@ -812,38 +869,6 @@ def describe(self, include=None, exclude=None): ) return res - def sum(self): - """Compute the column-wise sum of the values in each group.""" - return self.agg("sum") - - def prod(self): - """Compute the column-wise product of the values in each group.""" - return self.agg("prod") - - def idxmin(self): - """Get the column-wise index of the minimum value in each group.""" - return self.agg("idxmin") - - def idxmax(self): - """Get the column-wise index of the maximum value in each group.""" - return self.agg("idxmax") - - def min(self): - """Get the column-wise minimum value in each group.""" - return self.agg("min") - - def max(self): - """Get the column-wise maximum value in each group.""" - return self.agg("max") - - def mean(self): - """Compute the column-wise mean of the values in each group.""" - return self.agg("mean") - - def median(self): - """Get the column-wise median of the values in each group.""" - return self.agg("median") - def corr(self, method="pearson", min_periods=1): """ Compute pairwise correlation of columns, excluding NA/null values. @@ -1177,10 +1202,6 @@ def func(x): return self.agg(func) - def nunique(self): - """Compute the number of unique values in each column in each group.""" - return self.agg("nunique") - def collect(self): """Get a list of all the values for each column in each group.""" return self.agg("collect") @@ -1202,14 +1223,6 @@ def cummax(self): """Get the column-wise cumulative maximum value in each group.""" return self.agg("cummax") - def first(self): - """Get the first non-null value in each group.""" - return self.agg("first") - - def last(self): - """Get the last non-null value in each group.""" - return self.agg("last") - def diff(self, periods=1, axis=0): """Get the difference between the values in each group. @@ -1539,12 +1552,6 @@ def __getitem__(self, key): by=self.grouping.keys, dropna=self._dropna, sort=self._sort ) - def nunique(self): - """ - Return the number of unique values per group - """ - return self.agg("nunique") - class SeriesGroupBy(GroupBy): """ diff --git a/python/cudf/cudf/core/mixins/__init__.py b/python/cudf/cudf/core/mixins/__init__.py new file mode 100644 index 00000000000..cecf4c1c7ed --- /dev/null +++ b/python/cudf/cudf/core/mixins/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. + +from .reductions import Reducible + +__all__ = ["Reducible"] diff --git a/python/cudf/cudf/core/mixins/mixin_factory.py b/python/cudf/cudf/core/mixins/mixin_factory.py new file mode 100644 index 00000000000..ecb18f61830 --- /dev/null +++ b/python/cudf/cudf/core/mixins/mixin_factory.py @@ -0,0 +1,259 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. + +import inspect + + +# `functools.partialmethod` does not allow setting attributes such as +# __doc__ on the resulting method. So we use a simple alternative to +# it here: +def _partialmethod(method, *args1, **kwargs1): + def wrapper(self, *args2, **kwargs2): + return method(self, *args1, *args2, **kwargs1, **kwargs2) + + return wrapper + + +class Operation: + """Descriptor used to define operations for delegating mixins. + + This class is designed to be assigned to the attributes (the delegating + methods) defined by the OperationMixin. This class will create the method + and mimic all the expected attributes for that method to appear as though + it was originally designed on the class. The use of the descriptor pattern + ensures that the method is only created the first time it is invoked, after + which all further calls use the callable generated on the first invocation. + + Parameters + ---------- + name : str + The name of the operation. + docstring_format_args : str + The attribute of the owning class from which to pull format parameters + for this operation's docstring. + base_operation : str + The underlying operation function to be invoked when operation `name` + is called on the owning class. + """ + + def __init__(self, name, docstring_format_args, base_operation): + self._name = name + self._docstring_format_args = docstring_format_args + self._base_operation = base_operation + + def __get__(self, obj, owner=None): + retfunc = _partialmethod(self._base_operation, op=self._name) + + # Required attributes that will exist. + retfunc.__name__ = self._name + retfunc.__qualname__ = ".".join([owner.__name__, self._name]) + retfunc.__module__ = self._base_operation.__module__ + + if self._base_operation.__doc__ is not None: + retfunc.__doc__ = self._base_operation.__doc__.format( + cls=owner.__name__, + op=self._name, + **self._docstring_format_args, + ) + + retfunc.__annotations__ = self._base_operation.__annotations__.copy() + retfunc.__annotations__.pop("op", None) + retfunc_params = [ + v + for k, v in inspect.signature( + self._base_operation + ).parameters.items() + if k != "op" + ] + retfunc.__signature__ = inspect.Signature(retfunc_params) + + setattr(owner, self._name, retfunc) + + if obj is None: + return getattr(owner, self._name) + else: + return getattr(obj, self._name) + + +def _should_define_operation(cls, operation, base_operation_name): + if operation not in dir(cls): + return True + + # If the class doesn't override the base operation we stick to whatever + # parent implementation exists. + if base_operation_name not in cls.__dict__: + return False + + # At this point we know that the class has the operation defined but it + # also overrides the base operation. Since this function is called before + # the operation is defined on the current class, we know that it inherited + # the operation from a parent. We therefore have two possibilities: + # 1. A parent class manually defined the operation. That override takes + # precedence even if the current class defined the base operation. + # 2. A parent class has an auto-generated operation, i.e. it is of type + # Operation and was created by OperationMixin.__init_subclass__. The + # current class must override it so that its base operation is used + # rather than the parent's base operation. + for base_cls in cls.__mro__: + # The first attribute in the MRO is the one that will be used. + if operation in base_cls.__dict__: + return isinstance(base_cls.__dict__[operation], Operation) + + # This line should be unreachable since we know the attribute exists + # somewhere in the MRO if the for loop was entered. + assert False, "Operation attribute not found in hierarchy." + + +def _create_delegating_mixin( + mixin_name, + docstring, + category_name, + base_operation_name, + supported_operations, +): + """Factory for mixins defining collections of delegated operations. + + This function generates mixins based on two common paradigms in cuDF: + + 1. libcudf groups many operations into categories using a common API. These + APIs usually accept an enum to delineate the specific operation to + perform, e.g. binary operations use the `binary_operator` enum when + calling the `binary_operation` function. cuDF Python mimics this + structure by having operations within a category delegate to a common + internal function (e.g. DataFrame.__add__ calls DataFrame._binaryop). + 2. Many cuDF classes implement similar operations (e.g. `sum`) via + delegation to lower-level APIs before reaching a libcudf C++ function + call. As a result, many API function calls actually involve multiple + delegations to lower-level APIs that can look essentially identical. An + example of such a sequence would be DataFrame.sum -> DataFrame._reduce + -> Column.sum -> Column._reduce -> libcudf. + + This factory creates mixins for a category of operations implemented by via + this delegator pattern. The resulting mixins make it easy to share common + functions across various classes while also providing a common entrypoint + for implementing the centralized logic for a given category of operations. + Its usage is best demonstrated by example below. + + Parameters + ---------- + mixin_name : str + The name of the class. This argument should be the same as the object + that this function's output is assigned to, e.g. + :code:`Baz = _create_delegating_mixin("Baz", ...)`. + docstring : str + The documentation string for the mixin class. + category_name : str + The category of operations for which a mixin is being created. This + name will be used to define or access the following attributes as shown + in the example below: + - f'_{category_name}_DOCSTRINGS' + - f'_VALID_{category_name}S' # The subset of ops a subclass allows + - f'_SUPPORTED_{category_name}S' # The ops supported by the mixin + base_operation_name : str + The name given to the core function implementing this category of + operations. The corresponding function is the entrypoint for child + classes. + supported_ops : List[str] + The list of valid operations that subclasses of the resulting mixin may + request to be implemented. + + Examples + -------- + >>> # The class below: + >>> class Person: + ... def _greet(self, op): + ... print(op) + ... + ... def hello(self): + ... self._greet("hello") + ... + ... def goodbye(self): + ... self._greet("goodbye") + >>> # can be rewritten using a delegating mixin as follows: + >>> Greeter = _create_delegating_mixin( + ... "Greeter", "", "GREETING", "_greet", {"hello", "goodbye", "hey"} + ... ) + >>> # The `hello` and `goodbye` methods will now be automatically generated + >>> # for the Person class below. + >>> class Person(Greeter): + ... _VALID_GREETINGS = {"hello", "goodbye"} + ... + ... def _greet(self, op: str): + ... '''Say {op}.''' + ... print(op) + >>> mom = Person() + >>> mom.hello() + hello + >>> # The Greeter class could also enable the `hey` method, but Person did + >>> # not include it in the _VALID_GREETINGS set so it will not exist. + >>> mom.hey() + Traceback (most recent call last): + ... + AttributeError: 'Person' object has no attribute 'hey' + >>> # The docstrings for each method are generated by formatting the _greet + >>> # docstring with the operation name as well as any additional keys + >>> # provided via the _GREETING_DOCSTRINGS parameter. + >>> print(mom.hello.__doc__) + Say hello. + """ + # The first two attributes may be defined on subclasses of the generated + # OperationMixin to indicate valid attributes and parameters to use when + # formatting docstrings. The supported_attr will be defined on the + # OperationMixin itself to indicate what operations its subclass may + # inherit from it. + validity_attr = f"_VALID_{category_name}S" + docstring_attr = f"_{category_name}_DOCSTRINGS" + supported_attr = f"_SUPPORTED_{category_name}S" + + class OperationMixin: + @classmethod + def __init_subclass__(cls): + # Support composition of various OperationMixins. Note that since + # this __init_subclass__ is defined on mixins, it does not prohibit + # classes that inherit it from implementing this method as well as + # long as those implementations also include this super call. + super().__init_subclass__() + + # Only add the valid set of operations for a particular class. + valid_operations = set() + for base_cls in cls.__mro__: + valid_operations |= getattr(base_cls, validity_attr, set()) + + invalid_operations = valid_operations - supported_operations + assert ( + len(invalid_operations) == 0 + ), f"Invalid requested operations: {invalid_operations}" + + base_operation = getattr(cls, base_operation_name) + for operation in valid_operations: + if _should_define_operation( + cls, operation, base_operation_name + ): + docstring_format_args = getattr( + cls, docstring_attr, {} + ).get(operation, {}) + op_attr = Operation( + operation, docstring_format_args, base_operation + ) + setattr(cls, operation, op_attr) + + OperationMixin.__name__ = mixin_name + OperationMixin.__qualname__ = mixin_name + OperationMixin.__doc__ = docstring + + def _operation(self, op: str, *args, **kwargs): + raise NotImplementedError + + _operation.__name__ = base_operation_name + _operation.__qualname__ = ".".join([mixin_name, base_operation_name]) + _operation.__doc__ = ( + f"The core {category_name.lower()} function. Must be overridden by " + "subclasses, the default implementation raises a NotImplementedError." + ) + + setattr(OperationMixin, base_operation_name, _operation) + # This attribute is set in case lookup is convenient at a later point, but + # it is not strictly necessary since `supported_operations` is part of the + # closure associated with the class's creation. + setattr(OperationMixin, supported_attr, supported_operations) + + return OperationMixin diff --git a/python/cudf/cudf/core/mixins/reductions.py b/python/cudf/cudf/core/mixins/reductions.py new file mode 100644 index 00000000000..f73f0e8fbc6 --- /dev/null +++ b/python/cudf/cudf/core/mixins/reductions.py @@ -0,0 +1,35 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. + +from .mixin_factory import _create_delegating_mixin + +Reducible = _create_delegating_mixin( + "Reducible", + "Mixin encapsulating reduction operations.", + "REDUCTION", + "_reduce", + { + "sum", + "product", + "min", + "max", + "count", + "any", + "all", + "sum_of_squares", + "mean", + "var", + "std", + "median", + "argmax", + "argmin", + "nunique", + "nth", + "collect", + "unique", + "prod", + "idxmin", + "idxmax", + "first", + "last", + }, +) diff --git a/python/cudf/cudf/core/mixins/reductions.pyi b/python/cudf/cudf/core/mixins/reductions.pyi new file mode 100644 index 00000000000..600f30e9372 --- /dev/null +++ b/python/cudf/cudf/core/mixins/reductions.pyi @@ -0,0 +1,70 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. + +from __future__ import annotations + +class Reducible: + def sum(self): + ... + + def product(self): + ... + + def min(self): + ... + + def max(self): + ... + + def count(self): + ... + + def any(self): + ... + + def all(self): + ... + + def sum_of_squares(self): + ... + + def mean(self): + ... + + def var(self): + ... + + def std(self): + ... + + def median(self): + ... + + def argmax(self): + ... + + def argmin(self): + ... + + def nunique(self): + ... + + def nth(self): + ... + + def collect(self): + ... + + def prod(self): + ... + + def idxmin(self): + ... + + def idxmax(self): + ... + + def first(self): + ... + + def last(self): + ... diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 8574a152c44..fc692b54fa2 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2539,7 +2539,13 @@ def cov(self, other, min_periods=None): lhs, rhs = _align_indices([lhs, rhs], how="inner") - return lhs._column.cov(rhs._column) + try: + return lhs._column.cov(rhs._column) + except AttributeError: + raise TypeError( + f"cannot perform covariance with types {self.dtype}, " + f"{other.dtype}" + ) def transpose(self): """Return the transpose, which is by definition self. @@ -2575,7 +2581,12 @@ def corr(self, other, method="pearson", min_periods=None): rhs = other.nans_to_nulls().dropna() lhs, rhs = _align_indices([lhs, rhs], how="inner") - return lhs._column.corr(rhs._column) + try: + return lhs._column.corr(rhs._column) + except AttributeError: + raise TypeError( + f"cannot perform corr with types {self.dtype}, {other.dtype}" + ) def autocorr(self, lag=1): """Compute the lag-N autocorrelation. This method computes the Pearson diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index 50b206d3388..0e49d20b756 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -46,7 +46,10 @@ def _reduce( raise NotImplementedError( "numeric_only parameter is not implemented yet" ) - return getattr(self._column, op)(**kwargs) + try: + return getattr(self._column, op)(**kwargs) + except AttributeError: + raise TypeError(f"cannot perform {op} with type {self.dtype}") def _scan(self, op, axis=None, *args, **kwargs): if axis not in (None, 0): diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py index 2282a435ed3..fa482d52104 100644 --- a/python/cudf/cudf/core/window/rolling.py +++ b/python/cudf/cudf/core/window/rolling.py @@ -11,11 +11,12 @@ from cudf.api.types import is_integer, is_number from cudf.core import column from cudf.core.column.column import as_column +from cudf.core.mixins import Reducible from cudf.utils import cudautils from cudf.utils.utils import GetAttrGetItemMixin -class Rolling(GetAttrGetItemMixin): +class Rolling(GetAttrGetItemMixin, Reducible): """ Rolling window calculations. @@ -163,6 +164,15 @@ class Rolling(GetAttrGetItemMixin): _time_window = False + _VALID_REDUCTIONS = { + "sum", + "min", + "max", + "mean", + "var", + "std", + } + def __init__( self, obj, @@ -262,17 +272,17 @@ def _apply_agg(self, agg_name): else: return self._apply_agg_dataframe(self.obj, agg_name) - def sum(self): - return self._apply_agg("sum") - - def min(self): - return self._apply_agg("min") - - def max(self): - return self._apply_agg("max") + def _reduce( + self, op: str, *args, **kwargs, + ): + """Calculate the rolling {op}. - def mean(self): - return self._apply_agg("mean") + Returns + ------- + Series or DataFrame + Return type is the same as the original object. + """ + return self._apply_agg(op) def var(self, ddof=1): self.agg_params["ddof"] = ddof diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py index cb3a369d067..7bf339d6ab7 100644 --- a/python/cudf/cudf/tests/test_stats.py +++ b/python/cudf/cudf/tests/test_stats.py @@ -1,6 +1,5 @@ -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. -import re from concurrent.futures import ThreadPoolExecutor import numpy as np @@ -513,9 +512,7 @@ def test_cov_corr_invalid_dtypes(gsr): rfunc=gsr.corr, lfunc_args_and_kwargs=([psr],), rfunc_args_and_kwargs=([gsr],), - expected_error_message=re.escape( - f"cannot perform corr with types {gsr.dtype}, {gsr.dtype}" - ), + compare_error_message=False, ) assert_exceptions_equal( @@ -523,7 +520,5 @@ def test_cov_corr_invalid_dtypes(gsr): rfunc=gsr.cov, lfunc_args_and_kwargs=([psr],), rfunc_args_and_kwargs=([gsr],), - expected_error_message=re.escape( - f"cannot perform covarience with types {gsr.dtype}, {gsr.dtype}" - ), + compare_error_message=False, )