Implement a mixin for reductions (#9925)
This PR implements a factory for mixin classes based on a common pattern in cuDF: categories of similar functions all call a common method that performs standard pre/post-processing before invoking a lower-level API, either on one of the object's members (e.g. `Frame` objects calling `Column` methods) or in the C++ libcudf library. When added to another class, these mixins allow customizing which methods are exposed via a class-member set of method names. Documentation for these methods is generated by formatting the docstring of the common internal method, e.g. `_reduce` for reductions. As a first pass, this PR generates a single mixin for reductions and applies it to all the relevant classes. Future PRs will use this factory to generate mixins for scans, binary operations, and unary operations, and perhaps other similar categories as they are identified.
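
For illustration, here is a minimal, hedged sketch of the delegation pattern this factory encodes. It is not the actual `cudf.core.mixins` implementation; the class and helper names below are invented for the example, and the real factory handles docstring templating and validation more carefully.

```python
class Reducible:
    """Sketch of a reduction mixin: each name in ``_VALID_REDUCTIONS``
    becomes a public method that delegates to the ``_reduce`` hook."""

    _VALID_REDUCTIONS: set = set()

    def _reduce(self, op: str, *args, **kwargs):
        """Compute the {op} of the values."""
        raise NotImplementedError

    def __init_subclass__(cls, **kwargs):
        super().__init_subclass__(**kwargs)
        for op in cls.__dict__.get("_VALID_REDUCTIONS", set()):
            # Only generate a method if the subclass hasn't written one by
            # hand; this is the customization point described above.
            if not hasattr(cls, op):
                cls._make_reduction(op)

    @classmethod
    def _make_reduction(cls, op: str):
        def method(self, *args, **kwargs):
            return self._reduce(op, *args, **kwargs)

        method.__name__ = op
        # Documentation is generated by formatting the common method's docstring.
        doc = cls._reduce.__doc__ or Reducible._reduce.__doc__
        method.__doc__ = doc.format(op=op)
        setattr(cls, op, method)


class ToyColumn(Reducible):
    _VALID_REDUCTIONS = {"sum", "min", "max"}

    def __init__(self, values):
        self._values = list(values)

    def _reduce(self, op: str, *args, **kwargs):
        return {"sum": sum, "min": min, "max": max}[op](self._values)


col = ToyColumn([3, 1, 2])
print(col.sum(), col.min(), col.max())  # 6 1 3
print(ToyColumn.sum.__doc__)            # "Compute the sum of the values."
```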

This approach assumes a great deal of API homogeneity between the different methods in a category. `Frame` violates this assumption because similar operations often support slightly different parameters (for instance, some reductions support a `min_count` parameter), so for now `Frame` was not made `Reducible`. That decision could be revisited if 1) the degree of homogeneity of these function signatures increases over time, or 2) we can introduce greater customization into these mixins without adding too much complexity. A first attempt at (2) can be seen in [this branch](https://github.com/vyasr/cudf/tree/refactor/reductions_extended), but the additional complexity just to support `Frame` isn't really justifiable at this stage, so unless we can come up with a simpler solution I recommend leaving `Frame` as-is for now.
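
To make the homogeneity concern concrete, here is a small, hedged illustration (the function names are invented, not cuDF's API) of why a one-size-fits-all generated signature is awkward for `Frame`:

```python
import inspect


# Hand-written Frame-style reductions have heterogeneous signatures:
# some accept op-specific parameters such as min_count, others do not.
def frame_sum(self, skipna=None, min_count=0):
    ...


def frame_max(self, skipna=None):
    ...


# A mixin that stamps every reduction out of one template is forced into a
# lowest-common-denominator signature, so introspection and generated docs
# lose the per-op defaults that the hand-written versions advertise.
def generated_reduction(self, *args, **kwargs):
    return self._reduce(*args, **kwargs)


print(inspect.signature(frame_sum))            # (self, skipna=None, min_count=0)
print(inspect.signature(generated_reduction))  # (self, *args, **kwargs)
```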

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Ashwin Srinath (https://github.com/shwina)

Approvers:
  - Michael Wang (https://github.com/isVoid)
  - Ashwin Srinath (https://github.com/shwina)

URL: #9925
vyasr authored Feb 25, 2022
1 parent e0af727 commit 21325e8
Showing 15 changed files with 562 additions and 199 deletions.
78 changes: 25 additions & 53 deletions python/cudf/cudf/core/column/column.py
@@ -69,6 +69,7 @@
ListDtype,
StructDtype,
)
from cudf.core.mixins import Reducible
from cudf.utils import utils
from cudf.utils.dtypes import (
cudf_dtype_from_pa_type,
@@ -86,7 +87,14 @@
Slice = TypeVar("Slice", bound=slice)


class ColumnBase(Column, Serializable, NotIterable):
class ColumnBase(Column, Serializable, Reducible, NotIterable):
_VALID_REDUCTIONS = {
"any",
"all",
"max",
"min",
}

def as_frame(self) -> "cudf.core.frame.Frame":
"""
Converts a Column to Frame
@@ -674,16 +682,10 @@ def append(self, other: ColumnBase) -> ColumnBase:
return concat_columns([self, as_column(other)])

def quantile(
self,
q: Union[float, Sequence[float]],
interpolation: builtins.str,
exact: bool,
self, q: Union[float, Sequence[float]], interpolation: str, exact: bool
) -> ColumnBase:
raise TypeError(f"cannot perform quantile with type {self.dtype}")

def median(self, skipna: bool = None) -> ScalarLike:
raise TypeError(f"cannot perform median with type {self.dtype}")

def take(
self: T, indices: ColumnBase, nullify: bool = False, check_bounds=True
) -> T:
@@ -1162,53 +1164,23 @@ def _minmax(self, skipna: bool = None):
return libcudf.reduce.minmax(result_col)
return result_col

def min(self, skipna: bool = None, dtype: Dtype = None):
result_col = self._process_for_reduction(skipna=skipna)
if isinstance(result_col, ColumnBase):
return libcudf.reduce.reduce("min", result_col, dtype=dtype)
return result_col

def max(self, skipna: bool = None, dtype: Dtype = None):
result_col = self._process_for_reduction(skipna=skipna)
if isinstance(result_col, ColumnBase):
return libcudf.reduce.reduce("max", result_col, dtype=dtype)
return result_col

def sum(
self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0
):
raise TypeError(f"cannot perform sum with type {self.dtype}")

def product(
self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0
):
raise TypeError(f"cannot perform product with type {self.dtype}")

def mean(self, skipna: bool = None, dtype: Dtype = None):
raise TypeError(f"cannot perform mean with type {self.dtype}")

def std(self, skipna: bool = None, ddof=1, dtype: Dtype = np.float64):
raise TypeError(f"cannot perform std with type {self.dtype}")

def var(self, skipna: bool = None, ddof=1, dtype: Dtype = np.float64):
raise TypeError(f"cannot perform var with type {self.dtype}")

def kurtosis(self, skipna: bool = None):
raise TypeError(f"cannot perform kurtosis with type {self.dtype}")

def skew(self, skipna: bool = None):
raise TypeError(f"cannot perform skew with type {self.dtype}")

def cov(self, other: ColumnBase):
raise TypeError(
f"cannot perform covarience with types {self.dtype}, "
f"{other.dtype}"
)

def corr(self, other: ColumnBase):
raise TypeError(
f"cannot perform corr with types {self.dtype}, {other.dtype}"
)

def _reduce(
self, op: str, skipna: bool = None, min_count: int = 0, *args, **kwargs
) -> ScalarLike:
"""Compute {op} of column values.

skipna : bool
Whether or not na values must be skipped.
min_count : int, default 0
The minimum number of entries for the reduction, otherwise the
reduction returns NaN.
"""
preprocessed = self._process_for_reduction(
skipna=skipna, min_count=min_count
)
if isinstance(preprocessed, ColumnBase):
return libcudf.reduce.reduce(op, preprocessed, **kwargs)
return preprocessed

@property
def contains_na_entries(self) -> bool:
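
For orientation, a short hedged sketch of the call path that results from the `column.py` changes above (the snippet is illustrative and not part of the diff):

```python
from cudf.core.column import as_column

col = as_column([2, 1, 3])
# `min` is no longer written out on ColumnBase: it is generated by the
# Reducible mixin because "min" appears in _VALID_REDUCTIONS, and it
# delegates to ColumnBase._reduce("min", ...), which preprocesses the
# column and then dispatches to libcudf.reduce.reduce.
print(col.min())  # 1
```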
20 changes: 15 additions & 5 deletions python/cudf/cudf/core/column/datetime.py
@@ -1,4 +1,4 @@
# Copyright (c) 2019-2021, NVIDIA CORPORATION.
# Copyright (c) 2019-2022, NVIDIA CORPORATION.

from __future__ import annotations

@@ -346,17 +346,27 @@ def as_string_column(
column.column_empty(0, dtype="object", masked=False),
)

def mean(self, skipna=None, dtype=np.float64) -> ScalarLike:
def mean(
self, skipna=None, min_count: int = 0, dtype=np.float64
) -> ScalarLike:
return pd.Timestamp(
self.as_numerical.mean(skipna=skipna, dtype=dtype),
self.as_numerical.mean(
skipna=skipna, min_count=min_count, dtype=dtype
),
unit=self.time_unit,
)

def std(
self, skipna: bool = None, ddof: int = 1, dtype: Dtype = np.float64
self,
skipna: bool = None,
min_count: int = 0,
dtype: Dtype = np.float64,
ddof: int = 1,
) -> pd.Timedelta:
return pd.Timedelta(
self.as_numerical.std(skipna=skipna, ddof=ddof, dtype=dtype)
self.as_numerical.std(
skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof
)
* _numpy_to_pandas_conversion[self.time_unit],
)

88 changes: 31 additions & 57 deletions python/cudf/cudf/core/column/numerical_base.py
@@ -1,4 +1,4 @@
# Copyright (c) 2018-2021, NVIDIA CORPORATION.
# Copyright (c) 2018-2022, NVIDIA CORPORATION.
"""Define an interface for columns that can perform numerical operations."""

from __future__ import annotations
@@ -10,7 +10,7 @@

import cudf
from cudf import _lib as libcudf
from cudf._typing import Dtype, ScalarLike
from cudf._typing import ScalarLike
from cudf.core.column import ColumnBase


@@ -23,59 +23,14 @@ class NumericalBaseColumn(ColumnBase):
point, should be encoded here.
"""

def reduce(
self, op: str, skipna: bool = None, min_count: int = 0, **kwargs
) -> ScalarLike:
"""Perform a reduction operation.
op : str
The operation to perform.
skipna : bool
Whether or not na values must be
"""
preprocessed = self._process_for_reduction(
skipna=skipna, min_count=min_count
)
if isinstance(preprocessed, ColumnBase):
return libcudf.reduce.reduce(op, preprocessed, **kwargs)
else:
return preprocessed

def sum(
self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0
) -> ScalarLike:
return self.reduce(
"sum", skipna=skipna, dtype=dtype, min_count=min_count
)

def product(
self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0
) -> ScalarLike:
return self.reduce(
"product", skipna=skipna, dtype=dtype, min_count=min_count
)

def mean(
self, skipna: bool = None, dtype: Dtype = np.float64
) -> ScalarLike:
return self.reduce("mean", skipna=skipna, dtype=dtype)

def var(
self, skipna: bool = None, ddof: int = 1, dtype: Dtype = np.float64
) -> ScalarLike:
return self.reduce("var", skipna=skipna, dtype=dtype, ddof=ddof)

def std(
self, skipna: bool = None, ddof: int = 1, dtype: Dtype = np.float64
) -> ScalarLike:
return self.reduce("std", skipna=skipna, dtype=dtype, ddof=ddof)

def sum_of_squares(
self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0
) -> ScalarLike:
return self.reduce(
"sum_of_squares", skipna=skipna, dtype=dtype, min_count=min_count
)
_VALID_REDUCTIONS = {
"sum",
"product",
"sum_of_squares",
"mean",
"var",
"std",
}

def _can_return_nan(self, skipna: bool = None) -> bool:
return not skipna and self.has_nulls()
@@ -148,6 +103,25 @@ def quantile(
)
return result

def mean(self, skipna: bool = None, min_count: int = 0, dtype=np.float64):
return self._reduce(
"mean", skipna=skipna, min_count=min_count, dtype=dtype
)

def var(
self, skipna: bool = None, min_count: int = 0, dtype=np.float64, ddof=1
):
return self._reduce(
"var", skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof
)

def std(
self, skipna: bool = None, min_count: int = 0, dtype=np.float64, ddof=1
):
return self._reduce(
"std", skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof
)

def median(self, skipna: bool = None) -> NumericalBaseColumn:
skipna = True if skipna is None else skipna

@@ -171,7 +145,7 @@ def _numeric_quantile(
self, quant, interpolation, sorted_indices, exact
)

def cov(self, other: ColumnBase) -> float:
def cov(self, other: NumericalBaseColumn) -> float:
if (
len(self) == 0
or len(other) == 0
@@ -183,7 +157,7 @@ def cov(self, other: ColumnBase) -> float:
cov_sample = result.sum() / (len(self) - 1)
return cov_sample

def corr(self, other: ColumnBase) -> float:
def corr(self, other: NumericalBaseColumn) -> float:
if len(self) == 0 or len(other) == 0:
return cudf.utils.dtypes._get_nan_for_dtype(self.dtype)

2 changes: 1 addition & 1 deletion python/cudf/cudf/core/column/string.py
@@ -5154,7 +5154,7 @@ def to_arrow(self) -> pa.Array:
return super().to_arrow()

def sum(
self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0
self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0,
):
result_col = self._process_for_reduction(
skipna=skipna, min_count=min_count
21 changes: 15 additions & 6 deletions python/cudf/cudf/core/column/timedelta.py
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2021, NVIDIA CORPORATION.
# Copyright (c) 2020-2022, NVIDIA CORPORATION.

from __future__ import annotations

@@ -385,20 +385,29 @@ def quantile(
return result.astype(self.dtype)

def sum(
self, skipna: bool = None, dtype: Dtype = None, min_count=0
self, skipna: bool = None, min_count: int = 0, dtype: Dtype = None,
) -> pd.Timedelta:
return pd.Timedelta(
self.as_numerical.sum(
skipna=skipna, dtype=dtype, min_count=min_count
# Since sum isn't overridden in Numerical[Base]Column, mypy only
# sees the signature from Reducible (which doesn't have the extra
# parameters from ColumnBase._reduce) so we have to ignore this.
self.as_numerical.sum( # type: ignore
skipna=skipna, min_count=min_count, dtype=dtype
),
unit=self.time_unit,
)

def std(
self, skipna: bool = None, ddof: int = 1, dtype: Dtype = np.float64
self,
skipna: bool = None,
min_count: int = 0,
dtype: Dtype = np.float64,
ddof: int = 1,
) -> pd.Timedelta:
return pd.Timedelta(
self.as_numerical.std(skipna=skipna, ddof=ddof, dtype=dtype),
self.as_numerical.std(
skipna=skipna, min_count=min_count, ddof=ddof, dtype=dtype
),
unit=self.time_unit,
)

11 changes: 7 additions & 4 deletions python/cudf/cudf/core/dataframe.py
@@ -5410,10 +5410,13 @@ def _reduce(
axis = self._get_axis_from_axis_arg(axis)

if axis == 0:
result = [
getattr(self._data[col], op)(**kwargs)
for col in self._data.names
]
try:
result = [
getattr(self._data[col], op)(**kwargs)
for col in self._data.names
]
except AttributeError:
raise TypeError(f"cannot perform {op} with type {self.dtype}")

return Series._from_data(
{None: result}, as_index(self._data.names)