From d3a6b299b280b834cf538e4a1f9a3b12ab197bc3 Mon Sep 17 00:00:00 2001
From: Amanda Potts
Date: Thu, 13 Jun 2024 17:53:31 -0400
Subject: [PATCH] Closes #3326 DataFrame.assign

---
 PROTO_tests/tests/dataframe_test.py | 23 ++++++++
 arkouda/dataframe.py                | 85 +++++++++++++++++++++++++++++
 2 files changed, 108 insertions(+)

diff --git a/PROTO_tests/tests/dataframe_test.py b/PROTO_tests/tests/dataframe_test.py
index 2ee38c060d..16d01b8e1f 100644
--- a/PROTO_tests/tests/dataframe_test.py
+++ b/PROTO_tests/tests/dataframe_test.py
@@ -1310,6 +1310,29 @@ def test_sample_flags(self):
                 print(f"Failure with seed:\n{seed}")
             assert res
 
+    def test_assign(self):
+        ak_df = ak.DataFrame({"temp_c": ak.array([17.0, 25.0])}, index=ak.array(["Portland", "Berkeley"]))
+        pd_df = ak_df.to_pandas()
+        assert_frame_equal(  # value supplied as a callable evaluated on the frame
+            ak_df.assign(temp_f=lambda x: x.temp_c * 9 / 5 + 32).to_pandas(),
+            pd_df.assign(temp_f=lambda x: x.temp_c * 9 / 5 + 32),
+        )
+        assert_frame_equal(  # value supplied directly as a precomputed column
+            ak_df.assign(temp_f=ak_df["temp_c"] * 9 / 5 + 32).to_pandas(),
+            pd_df.assign(temp_f=pd_df["temp_c"] * 9 / 5 + 32),
+        )
+
+        assert_frame_equal(  # a later keyword reads a column created by an earlier one
+            ak_df.assign(
+                temp_f=lambda x: x["temp_c"] * 9 / 5 + 32,
+                temp_k=lambda x: (x["temp_f"] + 459.67) * 5 / 9,
+            ).to_pandas(),
+            pd_df.assign(
+                temp_f=lambda x: x["temp_c"] * 9 / 5 + 32,
+                temp_k=lambda x: (x["temp_f"] + 459.67) * 5 / 9,
+            ),
+        )
+
 
 def pda_to_str_helper(pda):
     return ak.array([f"str {i}" for i in pda.to_list()])
diff --git a/arkouda/dataframe.py b/arkouda/dataframe.py
index 52e366d024..6f54a78ffd 100644
--- a/arkouda/dataframe.py
+++ b/arkouda/dataframe.py
@@ -53,6 +53,23 @@
 ]
 
 
+def apply_if_callable(maybe_callable, obj, **kwargs):
+    """
+    Return ``maybe_callable(obj, **kwargs)`` when ``maybe_callable`` is callable;
+    otherwise return ``maybe_callable`` itself, unchanged.
+
+    Parameters
+    ----------
+    maybe_callable : callable or any other value
+    obj : object forwarded to the callable as its sole positional argument
+    **kwargs : keyword arguments forwarded to the callable
+    """
+    if callable(maybe_callable):
+        return maybe_callable(obj, **kwargs)
+
+    return maybe_callable
+
+
 def groupby_operators(cls):
     for name in GROUPBY_REDUCTION_TYPES:
         setattr(cls, name, cls._make_aggop(name))
@@ -5343,6 +5360,74 @@ def from_return_msg(cls, rep_msg):
 
         return cls(columns, idx)
 
+    def assign(self, **kwargs) -> DataFrame:
+        r"""
+        Assign new columns to a DataFrame.
+
+        Returns a new object with all original columns in addition to new ones.
+        Existing columns that are re-assigned will be overwritten.
+
+        Parameters
+        ----------
+        **kwargs : dict of {str: callable or Series}
+            The column names are keywords. If a value is callable, it is
+            called with the working copy of the DataFrame (including any
+            columns assigned by earlier keywords) and its result becomes
+            the new column. The callable must not modify its input DataFrame.
+            If the values are not callable, (e.g. a Series, scalar, or array),
+            they are simply assigned.
+
+        Returns
+        -------
+        DataFrame
+            A new DataFrame with the new columns in addition to
+            all the existing columns.
+
+        Notes
+        -----
+        Assigning multiple columns within the same ``assign`` is possible.
+        Later items in '\*\*kwargs' may refer to newly created or modified
+        columns in 'df'; items are computed and assigned into 'df' in order.
+
+        Examples
+        --------
+        >>> df = ak.DataFrame({'temp_c': [17.0, 25.0]},
+        ...                   index=['Portland', 'Berkeley'])
+        >>> df
+                  temp_c
+        Portland    17.0
+        Berkeley    25.0
+
+        Where the value is a callable, evaluated on `df`:
+
+        >>> df.assign(temp_f=lambda x: x.temp_c * 9 / 5 + 32)
+                  temp_c  temp_f
+        Portland    17.0    62.6
+        Berkeley    25.0    77.0
+
+        Alternatively, the same behavior can be achieved by directly
+        referencing an existing Series or sequence:
+
+        >>> df.assign(temp_f=df['temp_c'] * 9 / 5 + 32)
+                  temp_c  temp_f
+        Portland    17.0    62.6
+        Berkeley    25.0    77.0
+
+        You can create multiple columns within the same assign where one
+        of the columns depends on another one defined within the same assign:
+
+        >>> df.assign(temp_f=lambda x: x['temp_c'] * 9 / 5 + 32,
+        ...           temp_k=lambda x: (x['temp_f'] + 459.67) * 5 / 9)
+                  temp_c  temp_f  temp_k
+        Portland    17.0    62.6  290.15
+        Berkeley    25.0    77.0  298.15
+        """
+        data = self.copy(deep=None)  # NOTE(review): verify deep=None copy doesn't alias self
+
+        for k, v in kwargs.items():  # in keyword order; later callables see earlier columns
+            data[k] = apply_if_callable(v, data)
+        return data
+
 
 def intx(a, b):
     """