Skip to content

Commit

Permalink
API: Unify .update to generic
Browse files Browse the repository at this point in the history
  • Loading branch information
h-vetinari committed Oct 16, 2018
1 parent 913f71f commit c6e9dfb
Show file tree
Hide file tree
Showing 4 changed files with 193 additions and 146 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.24.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,7 @@ Other Enhancements
- :meth:`Series.resample` and :meth:`DataFrame.resample` have gained the :meth:`Resampler.quantile` (:issue:`15023`).
- :meth:`Index.to_frame` now supports overriding column name(s) (:issue:`22580`).
- New attribute :attr:`__git_version__` will return git commit sha of current build (:issue:`21295`).
- :meth:`Series.update` now supports the same keywords and functionality as :meth:`DataFrame.update` (:issue:`22358`).
- Compatibility with Matplotlib 3.0 (:issue:`22790`).

.. _whatsnew_0240.api_breaking:
Expand Down
136 changes: 0 additions & 136 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -5198,142 +5198,6 @@ def combiner(x, y, needs_i8_conversion=False):

return self.combine(other, combiner, overwrite=False)

def update(self, other, join='left', overwrite=True, filter_func=None,
           raise_conflict=False):
    """
    Modify in place using non-NA values from another DataFrame.

    Aligns on indices. There is no return value.

    Parameters
    ----------
    other : DataFrame, or object coercible into a DataFrame
        Should have at least one matching index/column label
        with the original DataFrame. If a Series is passed,
        its name attribute must be set, and that will be
        used as the column name to align with the original DataFrame.
    join : {'left'}, default 'left'
        Only left join is implemented, keeping the index and columns of the
        original object.
    overwrite : bool, default True
        How to handle non-NA values for overlapping keys:

        * True: overwrite original DataFrame's values
          with values from `other`.
        * False: only update values that are NA in
          the original DataFrame.

    filter_func : callable(1d-array) -> boolean 1d-array, optional
        Can choose to replace values other than NA. Return True for values
        that should be updated. When given, `overwrite` and
        `raise_conflict` are not consulted.
    raise_conflict : bool, default False
        If True, will raise a ValueError if the DataFrame and `other`
        both contain non-NA data in the same place.

    Raises
    ------
    ValueError
        When `raise_conflict` is True and there's overlapping non-NA data.
    NotImplementedError
        When `join` is anything other than ``'left'``.

    See Also
    --------
    dict.update : Similar method for dictionaries.
    DataFrame.merge : For column(s)-on-columns(s) operations.

    Examples
    --------
    >>> df = pd.DataFrame({'A': [1, 2, 3],
    ...                    'B': [400, 500, 600]})
    >>> new_df = pd.DataFrame({'B': [4, 5, 6],
    ...                        'C': [7, 8, 9]})
    >>> df.update(new_df)
    >>> df
       A  B
    0  1  4
    1  2  5
    2  3  6

    The DataFrame's length does not increase as a result of the update,
    only values at matching index/column labels are updated.

    >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
    ...                    'B': ['x', 'y', 'z']})
    >>> new_df = pd.DataFrame({'B': ['d', 'e', 'f', 'g', 'h', 'i']})
    >>> df.update(new_df)
    >>> df
       A  B
    0  a  d
    1  b  e
    2  c  f

    For Series, its name attribute must be set.

    >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
    ...                    'B': ['x', 'y', 'z']})
    >>> new_column = pd.Series(['d', 'e'], name='B', index=[0, 2])
    >>> df.update(new_column)
    >>> df
       A  B
    0  a  d
    1  b  y
    2  c  e
    >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
    ...                    'B': ['x', 'y', 'z']})
    >>> new_df = pd.DataFrame({'B': ['d', 'e']}, index=[1, 2])
    >>> df.update(new_df)
    >>> df
       A  B
    0  a  x
    1  b  d
    2  c  e

    If `other` contains NaNs the corresponding values are not updated
    in the original dataframe.

    >>> df = pd.DataFrame({'A': [1, 2, 3],
    ...                    'B': [400, 500, 600]})
    >>> new_df = pd.DataFrame({'B': [4, np.nan, 6]})
    >>> df.update(new_df)
    >>> df
       A      B
    0  1    4.0
    1  2  500.0
    2  3    6.0
    """
    import pandas.core.computation.expressions as expressions
    # TODO: Support other joins
    if join != 'left':  # pragma: no cover
        raise NotImplementedError("Only left join is supported")

    if not isinstance(other, DataFrame):
        other = DataFrame(other)

    # Left join: conform `other` to this frame's index and columns, so
    # labels missing from `other` become NA and are left untouched below.
    other = other.reindex_like(self)

    for col in self.columns:
        this = self[col].values
        that = other[col].values
        # `mask` is True where the ORIGINAL value is kept.
        if filter_func is not None:
            with np.errstate(all='ignore'):
                mask = ~filter_func(this) | isna(that)
        else:
            if raise_conflict:
                # Vectorised reduction instead of the builtin any(),
                # which would iterate the mask element by element.
                if (notna(this) & notna(that)).any():
                    raise ValueError("Data overlaps.")

            if overwrite:
                mask = isna(that)
            else:
                mask = notna(this)

        # don't overwrite columns unnecessarily
        if mask.all():
            continue

        self[col] = expressions.where(mask, this, that)

# ----------------------------------------------------------------------
# Data reshaping

Expand Down
164 changes: 164 additions & 0 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,32 @@ def _single_replace(self, to_replace, method, inplace, limit):
return result


def _update_column(this, that, overwrite=True, filter_func=None,
raise_conflict=False):
import pandas.core.computation.expressions as expressions

if filter_func is not None:
with np.errstate(all='ignore'):
mask = ~filter_func(this) | isna(that)
else:
if raise_conflict:
mask_this = notna(that)
mask_that = notna(this)
if any(mask_this & mask_that):
raise ValueError("Data overlaps.")

if overwrite:
mask = isna(that)
else:
mask = notna(this)

# don't overwrite columns unnecessarily
if mask.all():
return None

return expressions.where(mask, this, that)


class NDFrame(PandasObject, SelectionMixin):
"""
N-dimensional analogue of DataFrame. Store multi-dimensional in a
Expand Down Expand Up @@ -4079,6 +4105,144 @@ def _reindex_axis(self, new_index, fill_method, axis, copy):
else:
return self._constructor(new_data).__finalize__(self)

def update(self, other, join='left', overwrite=True, filter_func=None,
           raise_conflict=False):
    """
    Modify in place using non-NA values from another Series or DataFrame.

    Aligns on indices. There is no return value.

    Parameters
    ----------
    other : DataFrame, or object coercible into a DataFrame
        Should have at least one matching index/column label
        with the original DataFrame. If a Series is passed,
        its name attribute must be set, and that will be
        used as the column name to align with the original DataFrame.
        When the caller is a Series, `other` is coerced into a Series
        instead and aligned on the index only.
    join : {'left'}, default 'left'
        Only left join is implemented, keeping the index and columns of the
        original object.
    overwrite : bool, default True
        How to handle non-NA values for overlapping keys:

        * True: overwrite original DataFrame's values
          with values from `other`.
        * False: only update values that are NA in
          the original DataFrame.

    filter_func : callable(1d-array) -> boolean 1d-array, optional
        Can choose to replace values other than NA. Return True for values
        that should be updated.
    raise_conflict : bool, default False
        If True, will raise a ValueError if the DataFrame and `other`
        both contain non-NA data in the same place.

    Raises
    ------
    ValueError
        When `raise_conflict` is True and there's overlapping non-NA data.
    NotImplementedError
        When `join` is anything other than ``'left'``.

    See Also
    --------
    dict.update : Similar method for dictionaries.
    DataFrame.merge : For column(s)-on-columns(s) operations.

    Examples
    --------
    >>> df = pd.DataFrame({'A': [1, 2, 3],
    ...                    'B': [400, 500, 600]})
    >>> new_df = pd.DataFrame({'B': [4, 5, 6],
    ...                        'C': [7, 8, 9]})
    >>> df.update(new_df)
    >>> df
       A  B
    0  1  4
    1  2  5
    2  3  6

    The DataFrame's length does not increase as a result of the update,
    only values at matching index/column labels are updated.

    >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
    ...                    'B': ['x', 'y', 'z']})
    >>> new_df = pd.DataFrame({'B': ['d', 'e', 'f', 'g', 'h', 'i']})
    >>> df.update(new_df)
    >>> df
       A  B
    0  a  d
    1  b  e
    2  c  f

    For Series, its name attribute must be set.

    >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
    ...                    'B': ['x', 'y', 'z']})
    >>> new_column = pd.Series(['d', 'e'], name='B', index=[0, 2])
    >>> df.update(new_column)
    >>> df
       A  B
    0  a  d
    1  b  y
    2  c  e
    >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
    ...                    'B': ['x', 'y', 'z']})
    >>> new_df = pd.DataFrame({'B': ['d', 'e']}, index=[1, 2])
    >>> df.update(new_df)
    >>> df
       A  B
    0  a  x
    1  b  d
    2  c  e

    If `other` contains NaNs the corresponding values are not updated
    in the original dataframe.

    >>> df = pd.DataFrame({'A': [1, 2, 3],
    ...                    'B': [400, 500, 600]})
    >>> new_df = pd.DataFrame({'B': [4, np.nan, 6]})
    >>> df.update(new_df)
    >>> df
       A      B
    0  1    4.0
    1  2  500.0
    2  3    6.0
    """
    from pandas import Series, DataFrame
    # TODO: Support other joins
    if join != 'left':  # pragma: no cover
        raise NotImplementedError("Only left join is supported")

    if isinstance(self, ABCSeries):
        if not isinstance(other, ABCSeries):
            other = Series(other)
        # Left join: labels missing from `other` become NA and are skipped.
        other = other.reindex_like(self)
        this = self.values
        that = other.values
        updated = _update_column(this, that, overwrite=overwrite,
                                 filter_func=filter_func,
                                 raise_conflict=raise_conflict)
        if updated is None:
            # don't overwrite Series unnecessarily
            return
        # NOTE(review): this swaps the block's values directly; confirm the
        # block tolerates a dtype change (e.g. int -> float when `other`
        # carries NaN).
        self._data._block.values = updated
        # In-place Series updates previously refreshed the parent cache
        # after mutating the data; keep doing so, so that a frame this
        # Series was taken from does not serve stale values.
        self._maybe_update_cacher()
    else:  # DataFrame
        if not isinstance(other, ABCDataFrame):
            other = DataFrame(other)

        other = other.reindex_like(self)

        for col in self.columns:
            this = self[col].values
            that = other[col].values

            updated = _update_column(this, that, overwrite=overwrite,
                                     filter_func=filter_func,
                                     raise_conflict=raise_conflict)
            # don't overwrite columns unnecessarily
            if updated is None:
                continue
            self[col] = updated

def filter(self, items=None, like=None, regex=None, axis=None):
"""
Subset rows or columns of dataframe according to labels in
Expand Down
38 changes: 28 additions & 10 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2377,14 +2377,35 @@ def combine_first(self, other):

return this.where(notna(this), other)

def update(self, other):
def update(self, other, join='left', overwrite=True, filter_func=None,
raise_conflict=False):
"""
Modify Series in place using non-NA values from passed
Series. Aligns on index
Modify Series in place using non-NA values from passed Series.
Aligns on index.
Parameters
----------
other : Series
other : Series, or object coercible into a Series
Should have at least one matching index label with the calling
Series.
join : {'left'}, default 'left'
Only left join is implemented, keeping the index of the
original object.
overwrite : bool, default True
How to handle non-NA values for overlapping keys:
* True: overwrite the original Series' values
with values from `other`.
* False: only update values that are NA in
the original Series.
filter_func : callable(1d-array) -> boolean 1d-array, optional
Can choose to replace values other than NA. Return True for values
that should be updated.
raise_conflict : bool, default False
If True, will raise a ValueError if the Series and `other`
both contain non-NA data in the same place.
Examples
--------
Expand Down Expand Up @@ -2422,13 +2443,10 @@ def update(self, other):
1 2
2 6
dtype: int64
"""
other = other.reindex_like(self)
mask = notna(other)

self._data = self._data.putmask(mask=mask, new=other, inplace=True)
self._maybe_update_cacher()
super(Series, self).update(other, join=join, overwrite=overwrite,
filter_func=filter_func,
raise_conflict=raise_conflict)

# ----------------------------------------------------------------------
# Reindexing, sorting
Expand Down

0 comments on commit c6e9dfb

Please sign in to comment.