Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PERF: vectorize _interp_limit #16592

Merged
merged 3 commits into from
Jun 4, 2017
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 74 additions & 10 deletions pandas/core/missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,12 +143,6 @@ def interpolate_1d(xvalues, yvalues, method='linear', limit=None,
'DatetimeIndex')
method = 'values'

def _interp_limit(invalid, fw_limit, bw_limit):
"Get idx of values that won't be filled b/c they exceed the limits."
for x in np.where(invalid)[0]:
if invalid[max(0, x - fw_limit):x + bw_limit + 1].all():
yield x

valid_limit_directions = ['forward', 'backward', 'both']
limit_direction = limit_direction.lower()
if limit_direction not in valid_limit_directions:
Expand Down Expand Up @@ -180,21 +174,29 @@ def _interp_limit(invalid, fw_limit, bw_limit):

# default limit is unlimited GH #16282
if limit is None:
limit = len(xvalues)
# limit = len(xvalues)
pass
elif not is_integer(limit):
raise ValueError('Limit must be an integer')
elif limit < 1:
raise ValueError('Limit must be greater than 0')

# each possible limit_direction
if limit_direction == 'forward':
# TODO: do we need sorted?
if limit_direction == 'forward' and limit is not None:
violate_limit = sorted(start_nans |
set(_interp_limit(invalid, limit, 0)))
elif limit_direction == 'backward':
elif limit_direction == 'forward':
violate_limit = sorted(start_nans)
elif limit_direction == 'backward' and limit is not None:
violate_limit = sorted(end_nans |
set(_interp_limit(invalid, 0, limit)))
elif limit_direction == 'both':
elif limit_direction == 'backward':
violate_limit = sorted(end_nans)
elif limit_direction == 'both' and limit is not None:
violate_limit = sorted(_interp_limit(invalid, limit, limit))
else:
violate_limit = []

xvalues = getattr(xvalues, 'values', xvalues)
yvalues = getattr(yvalues, 'values', yvalues)
Expand Down Expand Up @@ -630,3 +632,65 @@ def fill_zeros(result, x, y, name, fill):
result = result.reshape(shape)

return result


def _interp_limit0(invalid, fw_limit, bw_limit):
"Get idx of values that won't be filled b/c they exceed the limits."
for x in np.where(invalid)[0]:
if invalid[max(0, x - fw_limit):x + bw_limit + 1].all():
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should rewrite this as well

yield x


def _interp_limit(invalid, fw_limit, bw_limit):
"""Get idx of values that won't be filled b/c they exceed the limits.

This is equivalent to the more readable, but slower

.. code-block:: python

for x in np.where(invalid)[0]:
if invalid[max(0, x - fw_limit):x + bw_limit + 1].all():
yield x
"""
# handle forward first; the backward direction is the same except
# 1. operate on the reversed array
# 2. subtract the returned indicies from N - 1
N = len(invalid)

def inner(invalid, limit):
limit = min(limit, N)
windowed = rolling_window(invalid, limit + 1).all(1)
idx = (set(np.where(windowed)[0] + limit) |
set(np.where((~invalid[:limit + 1]).cumsum() == 0)[0]))
return idx

if fw_limit == 0:
f_idx = set(np.where(invalid)[0])
else:
f_idx = inner(invalid, fw_limit)

if bw_limit == 0:
# then we don't even need to care about backwards, just use forwards
return f_idx
else:
b_idx = set(N - 1 - np.asarray(list(inner(invalid[::-1], bw_limit))))
if fw_limit == 0:
return b_idx
return f_idx & b_idx


def rolling_window(a, window):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe move this inside _interp_limit to avoid confusion that this is public and/or used externally

"""
[True, True, False, True, False], 2 ->

[
[True, True],
[True, False],
[False, True],
[True, False],
]
"""
# https://stackoverflow.com/a/6811241
shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
strides = a.strides + (a.strides[-1],)
return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)