From c13f38717572597a57dde29f73738790e0dc9c79 Mon Sep 17 00:00:00 2001 From: ajpotts Date: Mon, 1 Jul 2024 16:44:20 -0400 Subject: [PATCH] Closes #3326 DataFrame.assign (#3327) Co-authored-by: Amanda Potts --- PROTO_tests/tests/dataframe_test.py | 27 +++++++++ arkouda/dataframe.py | 88 +++++++++++++++++++++++++++++ 2 files changed, 115 insertions(+) diff --git a/PROTO_tests/tests/dataframe_test.py b/PROTO_tests/tests/dataframe_test.py index 5333213452..2a04a5e62d 100644 --- a/PROTO_tests/tests/dataframe_test.py +++ b/PROTO_tests/tests/dataframe_test.py @@ -1381,6 +1381,33 @@ def get_tail_values(col): df.to_pandas(retain_index=True).groupby("a").tail(n=2), ) + def test_assign(self): + ak_df = ak.DataFrame( + {"temp_c": ak.array([17.0, 25.0])}, index=ak.array(["Portland", "Berkeley"]) + ) + pd_df = ak_df.to_pandas() + + assert_frame_equal( + ak_df.assign(temp_f=lambda x: x.temp_c * 9 / 5 + 32).to_pandas(), + pd_df.assign(temp_f=lambda x: x.temp_c * 9 / 5 + 32), + ) + + assert_frame_equal( + ak_df.assign(temp_f=ak_df["temp_c"] * 9 / 5 + 32).to_pandas(), + pd_df.assign(temp_f=pd_df["temp_c"] * 9 / 5 + 32), + ) + + assert_frame_equal( + ak_df.assign( + temp_f=lambda x: x["temp_c"] * 9 / 5 + 32, + temp_k=lambda x: (x["temp_f"] + 459.67) * 5 / 9, + ).to_pandas(), + pd_df.assign( + temp_f=lambda x: x["temp_c"] * 9 / 5 + 32, + temp_k=lambda x: (x["temp_f"] + 459.67) * 5 / 9, + ), + ) + def pda_to_str_helper(pda): return ak.array([f"str {i}" for i in pda.to_list()]) diff --git a/arkouda/dataframe.py b/arkouda/dataframe.py index 6d5a451165..5539d037b1 100644 --- a/arkouda/dataframe.py +++ b/arkouda/dataframe.py @@ -54,6 +54,23 @@ ] +def apply_if_callable(maybe_callable, obj, **kwargs): + """ + Evaluate possibly callable input using obj and kwargs if it is callable, + otherwise return as it is. 
+ + Parameters + ---------- + maybe_callable : possibly a callable + obj : DataFrame + **kwargs + """ + if callable(maybe_callable): + return maybe_callable(obj, **kwargs) + + return maybe_callable + + def groupby_operators(cls): for name in GROUPBY_REDUCTION_TYPES: setattr(cls, name, cls._make_aggop(name)) @@ -1073,6 +1090,9 @@ def __setitem__(self, key, value): # Set a single column in the dataframe using a an arkouda array elif isinstance(key, str): + if isinstance(value, Series): + value = value.values + if not isinstance(value, self._COLUMN_CLASSES): raise ValueError(f"Column must be one of {self._COLUMN_CLASSES}.") elif self._nrows is not None and self._nrows != value.size: @@ -5495,6 +5515,74 @@ def from_return_msg(cls, rep_msg): return cls(columns, idx) + def assign(self, **kwargs) -> DataFrame: + r""" + Assign new columns to a DataFrame. + + Returns a new object with all original columns in addition to new ones. + Existing columns that are re-assigned will be overwritten. + + Parameters + ---------- + **kwargs : dict of {str: callable or Series} + The column names are keywords. If the values are + callable, they are computed on the DataFrame and + assigned to the new columns. The callable must not + change input DataFrame (though arkouda doesn't check it). + If the values are not callable, (e.g. a Series, scalar, or array), + they are simply assigned. + + Returns + ------- + DataFrame + A new DataFrame with the new columns in addition to + all the existing columns. + + Notes + ----- + Assigning multiple columns within the same ``assign`` is possible. + Later items in '\*\*kwargs' may refer to newly created or modified + columns in 'df'; items are computed and assigned into 'df' in order. + + Examples + -------- + >>> df = ak.DataFrame({'temp_c': [17.0, 25.0]}, + ... 
index=['Portland', 'Berkeley']) + >>> df + temp_c + Portland 17.0 + Berkeley 25.0 + + Where the value is a callable, evaluated on `df`: + + >>> df.assign(temp_f=lambda x: x.temp_c * 9 / 5 + 32) + temp_c temp_f + Portland 17.0 62.6 + Berkeley 25.0 77.0 + + Alternatively, the same behavior can be achieved by directly + referencing an existing Series or sequence: + + >>> df.assign(temp_f=df['temp_c'] * 9 / 5 + 32) + temp_c temp_f + Portland 17.0 62.6 + Berkeley 25.0 77.0 + + You can create multiple columns within the same assign where one + of the columns depends on another one defined within the same assign: + + >>> df.assign(temp_f=lambda x: x['temp_c'] * 9 / 5 + 32, + ... temp_k=lambda x: (x['temp_f'] + 459.67) * 5 / 9) + temp_c temp_f temp_k + Portland 17.0 62.6 290.15 + Berkeley 25.0 77.0 298.15 + """ + data = self.copy(deep=None) + + for k, v in kwargs.items(): + data[k] = apply_if_callable(v, data) + return data + def intx(a, b): """