Implement a mixin for reductions (#9925)
This PR implements a factory for mixin classes based on a common pattern in cuDF: categories of similar functions all call a common method that performs standard pre/post-processing before invoking a lower-level API, either on one of the object's members (e.g. `Frame` objects calling `Column` methods) or in the C++ libcudf library. When added to another class, these mixins allow customizing which methods are exposed via a class-member set of method names. Documentation for these methods is generated by formatting the docstring of the common internal method, e.g. `_reduce` for reductions. As a first pass, this PR generates a single mixin for reductions and applies it to all the relevant classes. Future PRs will use this factory to generate mixins for scans, binary operations, and unary operations, and perhaps other similar categories as they are identified.
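
For illustration, here is a minimal, hedged sketch of the delegation pattern this factory encodes. It is not the actual `cudf.core.mixins` implementation; the class and helper names below are invented for the example, and the real factory handles docstring templating and validation more carefully.

```python
class Reducible:
    """Sketch of a reduction mixin: each name in ``_VALID_REDUCTIONS``
    becomes a public method that delegates to the ``_reduce`` hook."""

    _VALID_REDUCTIONS: set = set()

    def _reduce(self, op: str, *args, **kwargs):
        """Compute the {op} of the values."""
        raise NotImplementedError

    def __init_subclass__(cls, **kwargs):
        super().__init_subclass__(**kwargs)
        for op in cls.__dict__.get("_VALID_REDUCTIONS", set()):
            # Only generate a method if the subclass hasn't written one by
            # hand; this is the customization point described above.
            if not hasattr(cls, op):
                cls._make_reduction(op)

    @classmethod
    def _make_reduction(cls, op: str):
        def method(self, *args, **kwargs):
            return self._reduce(op, *args, **kwargs)

        method.__name__ = op
        # Documentation is generated by formatting the common method's docstring.
        doc = cls._reduce.__doc__ or Reducible._reduce.__doc__
        method.__doc__ = doc.format(op=op)
        setattr(cls, op, method)


class ToyColumn(Reducible):
    _VALID_REDUCTIONS = {"sum", "min", "max"}

    def __init__(self, values):
        self._values = list(values)

    def _reduce(self, op: str, *args, **kwargs):
        return {"sum": sum, "min": min, "max": max}[op](self._values)


col = ToyColumn([3, 1, 2])
print(col.sum(), col.min(), col.max())  # 6 1 3
print(ToyColumn.sum.__doc__)            # "Compute the sum of the values."
```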

This approach assumes a great deal of API homogeneity between the different methods in a category. `Frame` violates this assumption because similar operations often support slightly different parameters (for instance, some reductions support a `min_count` parameter), so for now `Frame` was not made `Reducible`. That decision could be revisited if 1) the degree of homogeneity of these function signatures increases over time, or 2) we can introduce greater customization into these mixins without adding too much complexity. A first attempt at (2) can be seen in [this branch](https://github.com/vyasr/cudf/tree/refactor/reductions_extended), but the additional complexity just to support `Frame` isn't really justifiable at this stage, so unless we can come up with a simpler solution I recommend leaving `Frame` as-is for now.
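
To make the homogeneity concern concrete, here is a small, hedged illustration (the function names are invented, not cuDF's API) of why a one-size-fits-all generated signature is awkward for `Frame`:

```python
import inspect


# Hand-written Frame-style reductions have heterogeneous signatures:
# some accept op-specific parameters such as min_count, others do not.
def frame_sum(self, skipna=None, min_count=0):
    ...


def frame_max(self, skipna=None):
    ...


# A mixin that stamps every reduction out of one template is forced into a
# lowest-common-denominator signature, so introspection and generated docs
# lose the per-op defaults that the hand-written versions advertise.
def generated_reduction(self, *args, **kwargs):
    return self._reduce(*args, **kwargs)


print(inspect.signature(frame_sum))            # (self, skipna=None, min_count=0)
print(inspect.signature(generated_reduction))  # (self, *args, **kwargs)
```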

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Ashwin Srinath (https://github.com/shwina)

Approvers:
  - Michael Wang (https://github.com/isVoid)
  - Ashwin Srinath (https://github.com/shwina)

URL: #9925
vyasr authored Feb 25, 2022
1 parent e0af727 commit 21325e8
Showing 15 changed files with 562 additions and 199 deletions.
78 changes: 25 additions & 53 deletions python/cudf/cudf/core/column/column.py
@@ -69,6 +69,7 @@
ListDtype,
StructDtype,
)
from cudf.core.mixins import Reducible
from cudf.utils import utils
from cudf.utils.dtypes import (
cudf_dtype_from_pa_type,
@@ -86,7 +87,14 @@
Slice = TypeVar("Slice", bound=slice)


class ColumnBase(Column, Serializable, NotIterable):
class ColumnBase(Column, Serializable, Reducible, NotIterable):
_VALID_REDUCTIONS = {
"any",
"all",
"max",
"min",
}

def as_frame(self) -> "cudf.core.frame.Frame":
"""
Converts a Column to Frame
@@ -674,16 +682,10 @@ def append(self, other: ColumnBase) -> ColumnBase:
return concat_columns([self, as_column(other)])

def quantile(
self,
q: Union[float, Sequence[float]],
interpolation: builtins.str,
exact: bool,
self, q: Union[float, Sequence[float]], interpolation: str, exact: bool
) -> ColumnBase:
raise TypeError(f"cannot perform quantile with type {self.dtype}")

def median(self, skipna: bool = None) -> ScalarLike:
raise TypeError(f"cannot perform median with type {self.dtype}")

def take(
self: T, indices: ColumnBase, nullify: bool = False, check_bounds=True
) -> T:
@@ -1162,53 +1164,23 @@ def _minmax(self, skipna: bool = None):
return libcudf.reduce.minmax(result_col)
return result_col

def min(self, skipna: bool = None, dtype: Dtype = None):
result_col = self._process_for_reduction(skipna=skipna)
if isinstance(result_col, ColumnBase):
return libcudf.reduce.reduce("min", result_col, dtype=dtype)
return result_col

def max(self, skipna: bool = None, dtype: Dtype = None):
result_col = self._process_for_reduction(skipna=skipna)
if isinstance(result_col, ColumnBase):
return libcudf.reduce.reduce("max", result_col, dtype=dtype)
return result_col

def sum(
self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0
):
raise TypeError(f"cannot perform sum with type {self.dtype}")

def product(
self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0
):
raise TypeError(f"cannot perform product with type {self.dtype}")

def mean(self, skipna: bool = None, dtype: Dtype = None):
raise TypeError(f"cannot perform mean with type {self.dtype}")

def std(self, skipna: bool = None, ddof=1, dtype: Dtype = np.float64):
raise TypeError(f"cannot perform std with type {self.dtype}")

def var(self, skipna: bool = None, ddof=1, dtype: Dtype = np.float64):
raise TypeError(f"cannot perform var with type {self.dtype}")

def kurtosis(self, skipna: bool = None):
raise TypeError(f"cannot perform kurtosis with type {self.dtype}")

def skew(self, skipna: bool = None):
raise TypeError(f"cannot perform skew with type {self.dtype}")

def cov(self, other: ColumnBase):
raise TypeError(
f"cannot perform covarience with types {self.dtype}, "
f"{other.dtype}"
)

def corr(self, other: ColumnBase):
raise TypeError(
f"cannot perform corr with types {self.dtype}, {other.dtype}"
)

def _reduce(
self, op: str, skipna: bool = None, min_count: int = 0, *args, **kwargs
) -> ScalarLike:
"""Compute {op} of column values.

skipna : bool
Whether or not na values must be skipped.
min_count : int, default 0
The minimum number of entries for the reduction, otherwise the
reduction returns NaN.
"""
preprocessed = self._process_for_reduction(
skipna=skipna, min_count=min_count
)
if isinstance(preprocessed, ColumnBase):
return libcudf.reduce.reduce(op, preprocessed, **kwargs)
return preprocessed

@property
def contains_na_entries(self) -> bool:
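
For orientation, a short hedged sketch of the call path that results from the `column.py` changes above (the snippet is illustrative and not part of the diff):

```python
from cudf.core.column import as_column

col = as_column([2, 1, 3])
# `min` is no longer written out on ColumnBase: it is generated by the
# Reducible mixin because "min" appears in _VALID_REDUCTIONS, and it
# delegates to ColumnBase._reduce("min", ...), which preprocesses the
# column and then dispatches to libcudf.reduce.reduce.
print(col.min())  # 1
```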
20 changes: 15 additions & 5 deletions python/cudf/cudf/core/column/datetime.py
@@ -1,4 +1,4 @@
# Copyright (c) 2019-2021, NVIDIA CORPORATION.
# Copyright (c) 2019-2022, NVIDIA CORPORATION.

from __future__ import annotations

@@ -346,17 +346,27 @@ def as_string_column(
column.column_empty(0, dtype="object", masked=False),
)

def mean(self, skipna=None, dtype=np.float64) -> ScalarLike:
def mean(
self, skipna=None, min_count: int = 0, dtype=np.float64
) -> ScalarLike:
return pd.Timestamp(
self.as_numerical.mean(skipna=skipna, dtype=dtype),
self.as_numerical.mean(
skipna=skipna, min_count=min_count, dtype=dtype
),
unit=self.time_unit,
)

def std(
self, skipna: bool = None, ddof: int = 1, dtype: Dtype = np.float64
self,
skipna: bool = None,
min_count: int = 0,
dtype: Dtype = np.float64,
ddof: int = 1,
) -> pd.Timedelta:
return pd.Timedelta(
self.as_numerical.std(skipna=skipna, ddof=ddof, dtype=dtype)
self.as_numerical.std(
skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof
)
* _numpy_to_pandas_conversion[self.time_unit],
)

88 changes: 31 additions & 57 deletions python/cudf/cudf/core/column/numerical_base.py
@@ -1,4 +1,4 @@
# Copyright (c) 2018-2021, NVIDIA CORPORATION.
# Copyright (c) 2018-2022, NVIDIA CORPORATION.
"""Define an interface for columns that can perform numerical operations."""

from __future__ import annotations
@@ -10,7 +10,7 @@

import cudf
from cudf import _lib as libcudf
from cudf._typing import Dtype, ScalarLike
from cudf._typing import ScalarLike
from cudf.core.column import ColumnBase


@@ -23,59 +23,14 @@ class NumericalBaseColumn(ColumnBase):
point, should be encoded here.
"""

def reduce(
self, op: str, skipna: bool = None, min_count: int = 0, **kwargs
) -> ScalarLike:
"""Perform a reduction operation.
op : str
The operation to perform.
skipna : bool
Whether or not na values must be
"""
preprocessed = self._process_for_reduction(
skipna=skipna, min_count=min_count
)
if isinstance(preprocessed, ColumnBase):
return libcudf.reduce.reduce(op, preprocessed, **kwargs)
else:
return preprocessed

def sum(
self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0
) -> ScalarLike:
return self.reduce(
"sum", skipna=skipna, dtype=dtype, min_count=min_count
)

def product(
self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0
) -> ScalarLike:
return self.reduce(
"product", skipna=skipna, dtype=dtype, min_count=min_count
)

def mean(
self, skipna: bool = None, dtype: Dtype = np.float64
) -> ScalarLike:
return self.reduce("mean", skipna=skipna, dtype=dtype)

def var(
self, skipna: bool = None, ddof: int = 1, dtype: Dtype = np.float64
) -> ScalarLike:
return self.reduce("var", skipna=skipna, dtype=dtype, ddof=ddof)

def std(
self, skipna: bool = None, ddof: int = 1, dtype: Dtype = np.float64
) -> ScalarLike:
return self.reduce("std", skipna=skipna, dtype=dtype, ddof=ddof)

def sum_of_squares(
self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0
) -> ScalarLike:
return self.reduce(
"sum_of_squares", skipna=skipna, dtype=dtype, min_count=min_count
)
_VALID_REDUCTIONS = {
"sum",
"product",
"sum_of_squares",
"mean",
"var",
"std",
}

def _can_return_nan(self, skipna: bool = None) -> bool:
return not skipna and self.has_nulls()
@@ -148,6 +103,25 @@ def quantile(
)
return result

def mean(self, skipna: bool = None, min_count: int = 0, dtype=np.float64):
return self._reduce(
"mean", skipna=skipna, min_count=min_count, dtype=dtype
)

def var(
self, skipna: bool = None, min_count: int = 0, dtype=np.float64, ddof=1
):
return self._reduce(
"var", skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof
)

def std(
self, skipna: bool = None, min_count: int = 0, dtype=np.float64, ddof=1
):
return self._reduce(
"std", skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof
)

def median(self, skipna: bool = None) -> NumericalBaseColumn:
skipna = True if skipna is None else skipna

@@ -171,7 +145,7 @@ def _numeric_quantile(
self, quant, interpolation, sorted_indices, exact
)

def cov(self, other: ColumnBase) -> float:
def cov(self, other: NumericalBaseColumn) -> float:
if (
len(self) == 0
or len(other) == 0
@@ -183,7 +157,7 @@ def cov(self, other: ColumnBase) -> float:
cov_sample = result.sum() / (len(self) - 1)
return cov_sample

def corr(self, other: ColumnBase) -> float:
def corr(self, other: NumericalBaseColumn) -> float:
if len(self) == 0 or len(other) == 0:
return cudf.utils.dtypes._get_nan_for_dtype(self.dtype)

2 changes: 1 addition & 1 deletion python/cudf/cudf/core/column/string.py
@@ -5154,7 +5154,7 @@ def to_arrow(self) -> pa.Array:
return super().to_arrow()

def sum(
self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0
self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0,
):
result_col = self._process_for_reduction(
skipna=skipna, min_count=min_count
21 changes: 15 additions & 6 deletions python/cudf/cudf/core/column/timedelta.py
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2021, NVIDIA CORPORATION.
# Copyright (c) 2020-2022, NVIDIA CORPORATION.

from __future__ import annotations

@@ -385,20 +385,29 @@ def quantile(
return result.astype(self.dtype)

def sum(
self, skipna: bool = None, dtype: Dtype = None, min_count=0
self, skipna: bool = None, min_count: int = 0, dtype: Dtype = None,
) -> pd.Timedelta:
return pd.Timedelta(
self.as_numerical.sum(
skipna=skipna, dtype=dtype, min_count=min_count
# Since sum isn't overridden in Numerical[Base]Column, mypy only
# sees the signature from Reducible (which doesn't have the extra
# parameters from ColumnBase._reduce) so we have to ignore this.
self.as_numerical.sum( # type: ignore
skipna=skipna, min_count=min_count, dtype=dtype
),
unit=self.time_unit,
)

def std(
self, skipna: bool = None, ddof: int = 1, dtype: Dtype = np.float64
self,
skipna: bool = None,
min_count: int = 0,
dtype: Dtype = np.float64,
ddof: int = 1,
) -> pd.Timedelta:
return pd.Timedelta(
self.as_numerical.std(skipna=skipna, ddof=ddof, dtype=dtype),
self.as_numerical.std(
skipna=skipna, min_count=min_count, ddof=ddof, dtype=dtype
),
unit=self.time_unit,
)

11 changes: 7 additions & 4 deletions python/cudf/cudf/core/dataframe.py
@@ -5410,10 +5410,13 @@ def _reduce(
axis = self._get_axis_from_axis_arg(axis)

if axis == 0:
result = [
getattr(self._data[col], op)(**kwargs)
for col in self._data.names
]
try:
result = [
getattr(self._data[col], op)(**kwargs)
for col in self._data.names
]
except AttributeError:
raise TypeError(f"cannot perform {op} with type {self.dtype}")

return Series._from_data(
{None: result}, as_index(self._data.names)