From c69da84cfd4e1caf924797b18693129e74c20fbf Mon Sep 17 00:00:00 2001
From: Noam Hershtig
Date: Sat, 26 Jan 2019 21:00:38 +0200
Subject: [PATCH] Refactor groupby helper from tempita to fused types

---
 pandas/_libs/groupby.pyx           | 49 ++++++++++++++++++++++++++++++
 pandas/_libs/groupby_helper.pxi.in | 49 +-----------------------------
 pandas/core/groupby/ops.py         |  2 +-
 3 files changed, 51 insertions(+), 49 deletions(-)

diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index e6036654c71c33..a49995883756b2 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -2,6 +2,7 @@
 
 import cython
 from cython import Py_ssize_t
+from cython cimport floating
 
 from libc.stdlib cimport malloc, free
 
@@ -382,5 +383,53 @@ def group_any_all(uint8_t[:] out,
                 out[lab] = flag_val
 
 
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def group_add(floating[:, :] out,
+              int64_t[:] counts,
+              floating[:, :] values,
+              const int64_t[:] labels,
+              Py_ssize_t min_count=0):
+    """
+    Only aggregates on axis=0
+    """
+    cdef:
+        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
+        floating val, count
+        ndarray[floating, ndim=2] sumx, nobs
+
+    if not len(values) == len(labels):
+        raise AssertionError("len(index) != len(labels)")
+
+    nobs = np.zeros_like(out)
+    sumx = np.zeros_like(out)
+
+    N, K = (values).shape
+
+    with nogil:
+
+        for i in range(N):
+            lab = labels[i]
+            if lab < 0:
+                continue
+
+            counts[lab] += 1
+            for j in range(K):
+                val = values[i, j]
+
+                # not nan
+                if val == val:
+                    nobs[lab, j] += 1
+                    sumx[lab, j] += val
+
+        for i in range(ncounts):
+            for j in range(K):
+                if nobs[i, j] < min_count:
+                    out[i, j] = NAN
+                else:
+                    out[i, j] = sumx[i, j]
+
+
+
 # generated from template
 include "groupby_helper.pxi"
diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in
index 858039f038d024..db7018e1a72544 100644
--- a/pandas/_libs/groupby_helper.pxi.in
+++ b/pandas/_libs/groupby_helper.pxi.in
@@ -9,7 +9,7 @@ cdef extern from "numpy/npy_math.h":
 _int64_max = np.iinfo(np.int64).max
 
 # ----------------------------------------------------------------------
-# group_add, group_prod, group_var, group_mean, group_ohlc
+# group_prod, group_var, group_mean, group_ohlc
 # ----------------------------------------------------------------------
 
 {{py:
@@ -27,53 +27,6 @@ def get_dispatch(dtypes):
 {{for name, c_type in get_dispatch(dtypes)}}
 
 
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def group_add_{{name}}({{c_type}}[:, :] out,
-                       int64_t[:] counts,
-                       {{c_type}}[:, :] values,
-                       const int64_t[:] labels,
-                       Py_ssize_t min_count=0):
-    """
-    Only aggregates on axis=0
-    """
-    cdef:
-        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        {{c_type}} val, count
-        ndarray[{{c_type}}, ndim=2] sumx, nobs
-
-    if not len(values) == len(labels):
-        raise AssertionError("len(index) != len(labels)")
-
-    nobs = np.zeros_like(out)
-    sumx = np.zeros_like(out)
-
-    N, K = (values).shape
-
-    with nogil:
-
-        for i in range(N):
-            lab = labels[i]
-            if lab < 0:
-                continue
-
-            counts[lab] += 1
-            for j in range(K):
-                val = values[i, j]
-
-                # not nan
-                if val == val:
-                    nobs[lab, j] += 1
-                    sumx[lab, j] += val
-
-        for i in range(ncounts):
-            for j in range(K):
-                if nobs[i, j] < min_count:
-                    out[i, j] = NAN
-                else:
-                    out[i, j] = sumx[i, j]
-
-
 @cython.wraparound(False)
 @cython.boundscheck(False)
 def group_prod_{{name}}({{c_type}}[:, :] out,
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 87f48d5a405540..78c9aa91871351 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -380,7 +380,7 @@ def get_func(fname):
             # otherwise find dtype-specific version, falling back to object
             for dt in [dtype_str, 'object']:
                 f = getattr(libgroupby, "{fname}_{dtype_str}".format(
-                    fname=fname, dtype_str=dtype_str), None)
+                    fname=fname, dtype_str=dt), None)
                 if f is not None:
                     return f
 