Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cythonized GroupBy Fill #19673

Merged
merged 25 commits into from
Feb 25, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
c58123c
Added test case for groupby fill methods
WillAyd Feb 12, 2018
2bc8023
Added code for group_fillna
WillAyd Feb 13, 2018
3cb25c0
Added ASV benchmarks
WillAyd Feb 13, 2018
7fecc11
Connected GroupBy method to Cython fillna
WillAyd Feb 13, 2018
3c2fb36
Fixed issue when filling Series after GroupBy
WillAyd Feb 13, 2018
a52b8c4
Added tests to mix group entries; fixed sort bug
WillAyd Feb 13, 2018
16c1823
Simplified groupby Cython calls for ffill/bfill
WillAyd Feb 13, 2018
bd3d5e0
Removed abandoned Cython implementation
WillAyd Feb 13, 2018
cae65af
Added upcast to int64 to prevent 32 bit failures
WillAyd Feb 13, 2018
0266514
Fixed issue with reconstructing grouped Series
WillAyd Feb 14, 2018
50dc690
Changed .view to .astype to avoid 32 bit segfaults
WillAyd Feb 15, 2018
9fa8e25
Added whatsnew
WillAyd Feb 15, 2018
5da06d8
Aligned group_fillna and group_shift signatures
WillAyd Feb 19, 2018
2fe91a4
Fixed failing test; list comp for _fill method
WillAyd Feb 19, 2018
825ba17
Updated whatsnew
WillAyd Feb 19, 2018
127c71c
PEP8 fixes
WillAyd Feb 19, 2018
3a23cd6
Py27 support with super call
WillAyd Feb 20, 2018
a363146
Fixed LINT issue
WillAyd Feb 20, 2018
fd513c8
Used kwargs to call Cython groupby funcs
WillAyd Feb 20, 2018
776d1b7
Docstring for _fill method
WillAyd Feb 20, 2018
33f0d06
Cleaned up kwargs passing to Cython layer
WillAyd Feb 21, 2018
662008a
Idiomatic update - replace join with concat
WillAyd Feb 22, 2018
27e24fa
Moved non-templated funcs to groupby.pyx
WillAyd Feb 23, 2018
6f72476
Code update - swap group_index.take with grouper
WillAyd Feb 23, 2018
eff6603
Rebase and update import
WillAyd Feb 24, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions asv_bench/benchmarks/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -370,11 +370,11 @@ class GroupByMethods(object):

param_names = ['dtype', 'method']
params = [['int', 'float'],
['all', 'any', 'count', 'cumcount', 'cummax', 'cummin',
'cumprod', 'cumsum', 'describe', 'first', 'head', 'last', 'mad',
'max', 'min', 'median', 'mean', 'nunique', 'pct_change', 'prod',
'rank', 'sem', 'shift', 'size', 'skew', 'std', 'sum', 'tail',
'unique', 'value_counts', 'var']]
['all', 'any', 'bfill', 'count', 'cumcount', 'cummax', 'cummin',
'cumprod', 'cumsum', 'describe', 'ffill', 'first', 'head',
'last', 'mad', 'max', 'min', 'median', 'mean', 'nunique',
'pct_change', 'prod', 'rank', 'sem', 'shift', 'size', 'skew',
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we bench for the compat methods for datetimelikes? (timestamp / timedelta), most of these would work (except prod, mean, mad, pct_change and a few more), though these likely work for timedeltas. ok with adding an issue to do this as well (IOW doesn't need to be in this PR).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See #19733

'std', 'sum', 'tail', 'unique', 'value_counts', 'var']]

def setup(self, dtype, method):
ngroups = 1000
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.23.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -689,6 +689,7 @@ Performance Improvements
- Improved performance of pairwise ``.rolling()`` and ``.expanding()`` with ``.cov()`` and ``.corr()`` operations (:issue:`17917`)
- Improved performance of :func:`DataFrameGroupBy.rank` (:issue:`15779`)
- Improved performance of variable ``.rolling()`` on ``.min()`` and ``.max()`` (:issue:`19521`)
- Improved performance of ``GroupBy.ffill`` and ``GroupBy.bfill`` (:issue:`11296`)

.. _whatsnew_0230.docs:

Expand Down
216 changes: 216 additions & 0 deletions pandas/_libs/groupby.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -94,5 +94,221 @@ cdef inline float64_t kth_smallest_c(float64_t* a,
return a[k]


@cython.boundscheck(False)
@cython.wraparound(False)
def group_median_float64(ndarray[float64_t, ndim=2] out,
                         ndarray[int64_t] counts,
                         ndarray[float64_t, ndim=2] values,
                         ndarray[int64_t] labels,
                         Py_ssize_t min_count=-1):
    """
    Compute the median of each group, per column.

    Only aggregates on axis=0.

    Parameters
    ----------
    out : (ngroups, K) float64 array; receives the per-group medians
    counts : int64 array of length ngroups; receives each group's size
        (the NA group, label -1, is excluded)
    values : (N, K) float64 array of source data
    labels : int64 array mapping each row of `values` to a group; -1
        marks rows belonging to the NA group, which are skipped
    min_count : must be -1 here; the parameter exists only so the
        signature matches the other group aggregations

    Notes
    -----
    Modifies `out` and `counts` in place rather than returning a value.
    """
    cdef:
        Py_ssize_t i, j, N, K, ngroups, size
        ndarray[int64_t] _counts
        ndarray data
        float64_t* ptr

    assert min_count == -1, "'min_count' only used in add and prod"

    ngroups = len(counts)
    N, K = (<object> values).shape

    # groupsort_indexer sorts row positions by group label with the NA
    # group (-1) first; _counts has ngroups + 1 entries, _counts[0]
    # being the NA-group size.
    indexer, _counts = groupsort_indexer(labels, ngroups)
    counts[:] = _counts[1:]

    # Transposed, group-sorted copy so each column's values for a given
    # group are contiguous in memory for the pointer walk below.
    data = np.empty((K, N), dtype=np.float64)
    ptr = <float64_t*> data.data

    take_2d_axis1_float64_float64(values.T, indexer, out=data)

    with nogil:

        for i in range(K):
            # exclude NA group by skipping its leading run in each column
            ptr += _counts[0]
            for j in range(ngroups):
                size = _counts[j + 1]
                out[j, i] = median_linear(ptr, size)
                ptr += size


@cython.boundscheck(False)
@cython.wraparound(False)
def group_cumprod_float64(float64_t[:, :] out,
                          float64_t[:, :] values,
                          int64_t[:] labels,
                          bint is_datetimelike):
    """
    Cumulative product of `values` within each group, written to `out`.

    Only transforms on axis=0.

    Parameters
    ----------
    out : (N, K) float64 array; receives the running product for each row
    values : (N, K) float64 array of source data
    labels : int64 array mapping each row to a group; rows labeled -1
        are left untouched in `out`
    is_datetimelike : unused here; kept so the signature matches the
        other cumulative group transforms

    Notes
    -----
    Modifies `out` in place. NaN values do not update the running
    product (val == val is False for NaN), so out[i, j] for a NaN row
    keeps the group's previous accumulated product.
    """
    cdef:
        Py_ssize_t i, j, N, K, size
        float64_t val
        float64_t[:, :] accum
        int64_t lab

    N, K = (<object> values).shape
    # one running product per (group, column)
    accum = np.ones_like(values)

    with nogil:
        for i in range(N):
            lab = labels[i]

            if lab < 0:
                continue
            for j in range(K):
                val = values[i, j]
                # skip NaN: leave the accumulator unchanged
                if val == val:
                    accum[lab, j] *= val
                    out[i, j] = accum[lab, j]


@cython.boundscheck(False)
@cython.wraparound(False)
def group_cumsum(numeric[:, :] out,
                 numeric[:, :] values,
                 int64_t[:] labels,
                 is_datetimelike):
    """
    Cumulative sum of `values` within each group, written to `out`.

    Only transforms on axis=0.

    Parameters
    ----------
    out : (N, K) array of the fused `numeric` type; receives the
        running sum for each row
    values : (N, K) array of source data (same fused type as `out`)
    labels : int64 array mapping each row to a group; rows labeled -1
        are left untouched in `out`
    is_datetimelike : unused here; kept so the signature matches the
        other cumulative group transforms

    Notes
    -----
    Modifies `out` in place. For float specializations, NaN values do
    not update the running sum; integer specializations have no NaN
    concept, so every value is accumulated.
    """
    cdef:
        Py_ssize_t i, j, N, K, size
        numeric val
        numeric[:, :] accum
        int64_t lab

    N, K = (<object> values).shape
    # one running sum per (group, column)
    accum = np.zeros_like(values)

    with nogil:
        for i in range(N):
            lab = labels[i]

            if lab < 0:
                continue
            for j in range(K):
                val = values[i, j]

                # compile-time branch on the fused type: only floats
                # need (and can express) the NaN skip
                if numeric == float32_t or numeric == float64_t:
                    if val == val:
                        accum[lab, j] += val
                        out[i, j] = accum[lab, j]
                else:
                    accum[lab, j] += val
                    out[i, j] = accum[lab, j]


@cython.boundscheck(False)
@cython.wraparound(False)
def group_shift_indexer(ndarray[int64_t] out, ndarray[int64_t] labels,
                        int ngroups, int periods):
    """
    Fill `out` with, for each row, the position of the row `periods`
    steps earlier (or later, for negative periods) within the same
    group, or -1 where no such row exists (which callers treat as NA).

    Parameters
    ----------
    out : int64 array of length N; receives the take-indexer
    labels : int64 array mapping each row to a group; -1 marks null keys
    ngroups : number of distinct groups in `labels`
    periods : number of positions to shift; negative shifts backwards

    Notes
    -----
    Modifies `out` in place rather than returning a value.
    """
    cdef:
        Py_ssize_t N, i, j, ii
        int offset, sign
        int64_t lab, idxer, idxer_slot
        # how many rows of each group have been visited so far
        int64_t[:] label_seen = np.zeros(ngroups, dtype=np.int64)
        int64_t[:, :] label_indexer
    N, = (<object> labels).shape

    if periods < 0:
        # normalize to a positive period and walk the array backwards
        periods = -periods
        offset = N - 1
        sign = -1
    elif periods > 0:
        offset = 0
        sign = 1

    if periods == 0:
        # shift(0) is the identity take
        with nogil:
            for i in range(N):
                out[i] = i
    else:
        # circular buffer per group holding the last `periods` row
        # positions seen for that group
        label_indexer = np.zeros((ngroups, periods), dtype=np.int64)
        with nogil:
            for i in range(N):
                # reverse iterator if shifting backwards
                ii = offset + sign * i
                lab = labels[ii]

                # Skip null keys
                if lab == -1:
                    out[ii] = -1
                    continue

                label_seen[lab] += 1

                idxer_slot = label_seen[lab] % periods
                idxer = label_indexer[lab, idxer_slot]

                # only valid once the group has produced more than
                # `periods` rows; before that the shifted value is NA
                if label_seen[lab] > periods:
                    out[ii] = idxer
                else:
                    out[ii] = -1

                label_indexer[lab, idxer_slot] = ii


@cython.wraparound(False)
@cython.boundscheck(False)
def group_fillna_indexer(ndarray[int64_t] out, ndarray[int64_t] labels,
                         ndarray[uint8_t] mask, object direction,
                         int64_t limit):
    """Indexes how to fill values forwards or backwards within a group

    Parameters
    ----------
    out : array of int64_t values which this method will write its results to
        Missing values will be written to with a value of -1
    labels : array containing unique label for each group, with its ordering
        matching up to the corresponding record in `values`
    mask : array of uint8_t values where a 1 indicates a missing value
    direction : {'ffill', 'bfill'}
        Direction for fill to be applied (forwards or backwards, respectively)
    limit : Consecutive values to fill before stopping, or -1 for no limit

    Notes
    -----
    This method modifies the `out` parameter rather than returning an object
    """
    cdef:
        Py_ssize_t i, N
        ndarray[int64_t] sorted_labels
        int64_t idx, curr_fill_idx=-1, filled_vals=0

    N = len(out)

    # Make sure all arrays are the same size
    assert N == len(labels) == len(mask)

    # A *stable* sort is required: within a group, rows must keep their
    # original relative order or fills would propagate from the wrong
    # row. np.argsort's default quicksort is not stable, so request
    # mergesort explicitly.
    sorted_labels = np.argsort(labels, kind='mergesort').astype(
        np.int64, copy=False)
    if direction == 'bfill':
        sorted_labels = sorted_labels[::-1]

    with nogil:
        for i in range(N):
            idx = sorted_labels[i]
            if mask[idx] == 1:  # is missing
                # Stop filling once we've hit the limit
                if filled_vals >= limit and limit != -1:
                    curr_fill_idx = -1
                filled_vals += 1
            else:  # reset items when not missing
                filled_vals = 0
                curr_fill_idx = idx

            out[idx] = curr_fill_idx

            # If we move to the next group, reset
            # the fill_idx and counter
            if i == N - 1 or labels[idx] != labels[sorted_labels[i + 1]]:
                curr_fill_idx = -1
                filled_vals = 0


# generated from template
include "groupby_helper.pxi"
Loading