Skip to content

Commit

Permalink
PERF: improved clip performance (pandas-dev#16364)
Browse files Browse the repository at this point in the history
  • Loading branch information
jreback authored and stangirala committed Jun 11, 2017
1 parent 843e2e7 commit 4c6b1c9
Show file tree
Hide file tree
Showing 4 changed files with 46 additions and 2 deletions.
11 changes: 11 additions & 0 deletions asv_bench/benchmarks/series_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ def setup(self):
# asv timing benchmark (the ``time_`` prefix marks it as a timed benchmark):
# times a single ``dropna()`` call on ``self.s``.
# NOTE(review): ``self.s`` is built outside this view (presumably by
# ``setup``); the "int64" in the name suggests an integer series -- confirm.
def time_series_dropna_int64(self):
self.s.dropna()


class series_dropna_datetime(object):
goal_time = 0.2

Expand All @@ -120,3 +121,13 @@ def setup(self):

# asv timing benchmark (the ``time_`` prefix marks it as a timed benchmark):
# times a single ``dropna()`` call on ``self.s``.
# NOTE(review): ``self.s`` is built outside this view (presumably by
# ``setup``); the name suggests datetime data -- confirm.
def time_series_dropna_datetime(self):
self.s.dropna()


class series_clip(object):
    """asv benchmark timing ``Series.clip`` with scalar lower/upper bounds."""

    goal_time = 0.2

    def setup(self):
        # 50 standard-normal draws, so some values fall on each side of
        # the [0, 1] window used below.
        self.s = pd.Series(np.random.randn(50))

    def time_series_clip(self):
        # Previously named ``time_series_dropna_datetime`` (copy-pasted
        # from the dropna benchmark), which filed the clip timing under a
        # misleading name in the benchmark results.
        self.s.clip(0, 1)
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v0.20.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ Highlights include:
Enhancements
~~~~~~~~~~~~

- Unblocked access to additional compression types supported in pytables: 'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy', 'blosc:zlib', 'blosc:zstd' (:issue:`14478`)
- Unblocked access to additional compression types supported in pytables: 'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy', 'blosc:zlib', 'blosc:zstd' (:issue:`14478`)

.. _whatsnew_0202.performance:

Expand All @@ -28,6 +28,7 @@ Performance Improvements

- Performance regression fix when indexing with a list-like (:issue:`16285`)
- Performance regression fix for small MultiIndexes (:issue:`16319`)
- Improved performance of ``.clip()`` with scalar arguments (:issue:`15400`)

.. _whatsnew_0202.bug_fixes:

Expand Down
33 changes: 32 additions & 1 deletion pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
_ensure_int64,
needs_i8_conversion,
is_scalar,
is_number,
is_integer, is_bool,
is_bool_dtype,
is_numeric_dtype,
Expand Down Expand Up @@ -4104,6 +4105,22 @@ def isnull(self):
def notnull(self):
return notnull(self).__finalize__(self)

def _clip_with_scalar(self, lower, upper):

if ((lower is not None and np.any(isnull(lower))) or
(upper is not None and np.any(isnull(upper)))):
raise ValueError("Cannot use an NA value as a clip threshold")

result = self.values
mask = isnull(result)
if upper is not None:
result = np.where(result >= upper, upper, result)
if lower is not None:
result = np.where(result <= lower, lower, result)
result[mask] = np.nan
return self._constructor(
result, **self._construct_axes_dict()).__finalize__(self)

def clip(self, lower=None, upper=None, axis=None, *args, **kwargs):
"""
Trim values at input threshold(s).
Expand All @@ -4122,26 +4139,29 @@ def clip(self, lower=None, upper=None, axis=None, *args, **kwargs):
Examples
--------
>>> df
0 1
0 1
0 0.335232 -1.256177
1 -1.367855 0.746646
2 0.027753 -1.176076
3 0.230930 -0.679613
4 1.261967 0.570967
>>> df.clip(-1.0, 0.5)
0 1
0 0.335232 -1.000000
1 -1.000000 0.500000
2 0.027753 -1.000000
3 0.230930 -0.679613
4 0.500000 0.500000
>>> t
0 -0.3
1 -0.2
2 -0.1
3 0.0
4 0.1
dtype: float64
>>> df.clip(t, t + 1, axis=0)
0 1
0 0.335232 -0.300000
Expand All @@ -4160,6 +4180,11 @@ def clip(self, lower=None, upper=None, axis=None, *args, **kwargs):
if is_scalar(lower) and is_scalar(upper):
lower, upper = min(lower, upper), max(lower, upper)

# fast-path for scalars
if ((lower is None or (is_scalar(lower) and is_number(lower))) and
(upper is None or (is_scalar(upper) and is_number(upper)))):
return self._clip_with_scalar(lower, upper)

result = self
if lower is not None:
result = result.clip_lower(lower, axis)
Expand Down Expand Up @@ -4189,6 +4214,9 @@ def clip_upper(self, threshold, axis=None):
if np.any(isnull(threshold)):
raise ValueError("Cannot use an NA value as a clip threshold")

if is_scalar(threshold) and is_number(threshold):
return self._clip_with_scalar(None, threshold)

subset = self.le(threshold, axis=axis) | isnull(self)
return self.where(subset, threshold, axis=axis)

Expand All @@ -4213,6 +4241,9 @@ def clip_lower(self, threshold, axis=None):
if np.any(isnull(threshold)):
raise ValueError("Cannot use an NA value as a clip threshold")

if is_scalar(threshold) and is_number(threshold):
return self._clip_with_scalar(threshold, None)

subset = self.ge(threshold, axis=axis) | isnull(self)
return self.where(subset, threshold, axis=axis)

Expand Down
1 change: 1 addition & 0 deletions pandas/tests/series/test_analytics.py
Original file line number Diff line number Diff line change
Expand Up @@ -1011,6 +1011,7 @@ def test_clip_against_series(self):

lower = Series([1.0, 2.0, 3.0])
upper = Series([1.5, 2.5, 3.5])

assert_series_equal(s.clip(lower, upper), Series([1.0, 2.0, 3.5]))
assert_series_equal(s.clip(1.5, upper), Series([1.5, 1.5, 3.5]))

Expand Down

0 comments on commit 4c6b1c9

Please sign in to comment.