From 08e7230606e7214924f73b21960d786b305b6927 Mon Sep 17 00:00:00 2001
From: Wenjie Du
Date: Tue, 28 May 2024 16:41:37 +0800
Subject: [PATCH 1/2] feat: update the preprocessing pipeline of physionet2019;

---
 benchpots/preprocessing/physionet_2019.py | 123 +++++++++++-----------
 1 file changed, 59 insertions(+), 64 deletions(-)

diff --git a/benchpots/preprocessing/physionet_2019.py b/benchpots/preprocessing/physionet_2019.py
index eff3217..7654b2e 100644
--- a/benchpots/preprocessing/physionet_2019.py
+++ b/benchpots/preprocessing/physionet_2019.py
@@ -16,7 +16,13 @@
 from .utils import create_missingness, print_final_dataset_info
 
 
-def preprocess_physionet2019(rate, pattern: str = "point", subset="all", **kwargs):
+def preprocess_physionet2019(
+    rate,
+    pattern: str = "point",
+    subset="all",
+    features: list = None,
+    **kwargs,
+):
     """Generate a fully-prepared PhysioNet2019 dataset for benchmarking and validating POTS models.
 
     Parameters
@@ -30,6 +36,8 @@ def preprocess_physionet2019(rate, pattern: str = "point", subset="all", **kwarg
 
     subset
 
+    features
+
     Returns
     -------
     processed_dataset: dict,
@@ -38,76 +46,79 @@ def preprocess_physionet2019(rate, pattern: str = "point", subset="all", **kwarg
     """
 
     def apply_func(df_temp):  # pad and truncate to set the max length of samples as 48
-        missing = list(set(range(0, 48)).difference(set(df_temp["ICULOS"])))
-        missing_part = pd.DataFrame({"ICULOS": missing})
-        df_temp = pd.concat(
-            [df_temp, missing_part], ignore_index=False, sort=False
-        )  # pad the sample's length to 48 if it doesn't have enough time steps
-        df_temp = df_temp.set_index("ICULOS").sort_index().reset_index()
-        df_temp = df_temp.iloc[:48]  # truncate
+        if len(df_temp) < 48:
+            return None
+        else:
+            df_temp = df_temp.set_index("ICULOS").sort_index().reset_index()
+            df_temp = df_temp.iloc[:48]  # truncate
         return df_temp
 
     all_subsets = ["all", "training_setA", "training_setB"]
     assert (
-        subset.lower() in all_subsets
+        subset in all_subsets
     ), f"subset should be one of {all_subsets}, but got {subset}"
     assert 0 <= rate < 1, f"rate must be in [0, 1), but got {rate}"
 
     # read the raw data
     data = tsdb.load("physionet_2019")
-    data["static_features"].remove("HospAdmTime")  # keep HospAdmTime for now
+    all_features = set(data["training_setA"].columns)
+    label_feature = "SepsisLabel"  # SepsisLabel indicates whether the patient develops sepsis
+    time_feature = "ICULOS"  # ICU length-of-stay (hours since ICU admit)
 
     if subset != "all":
         df = data[subset]
         X = df.reset_index(drop=True)
-        unique_ids = df["RecordID"].unique()
-        y = data[f"outcomes-{subset.split('_')[-1]}"]
-        y = y.loc[unique_ids]
     else:
         df = pd.concat([data["training_setA"], data["training_setB"]], sort=True)
         X = df.reset_index(drop=True)
-        unique_ids = df["RecordID"].unique()
-        y = pd.concat([data["outcomes-setA"], data["outcomes-setB"]])
-        y = y.loc[unique_ids]
 
-    # remove the other static features, e.g. age, gender
-    X = X.drop(data["static_features"], axis=1)
+    if (
+        features is None
+    ):  # if features are not specified, use all features except the static ones, e.g. age
+        X = X.drop(data["static_features"], axis=1)
+    else:  # if features are specified by the user, only use the specified features
+        # check if the given features are valid
+        features_set = set(features)
+        if not all_features.issuperset(features_set):
+            intersection_feats = all_features.intersection(features_set)
+            difference = features_set.difference(intersection_feats)
+            raise ValueError(
+                f"Given features contain invalid features that are not in the dataset: {difference}"
+            )
+        # check if the given features contain the columns necessary for preprocessing
+        if "RecordID" not in features:
+            features.append("RecordID")
+        if label_feature not in features:
+            features.append(label_feature)
+        if time_feature not in features:
+            features.append(time_feature)
+
+        # finally, select only the specified features
+        X = X[features]
+
     X = X.groupby("RecordID").apply(apply_func)
     X = X.drop("RecordID", axis=1)
     X = X.reset_index()
-    HospAdmTime = X[["RecordID", "HospAdmTime"]].set_index("RecordID").dropna()
-    X = X.drop(["level_1", "HospAdmTime"], axis=1)
+    X = X.drop(["level_1"], axis=1)
 
-    # PhysioNet2019 is an imbalanced dataset, hence, we separate positive and negative samples here for later splitting
-    # This is to ensure positive and negative ratios are similar in train/val/test sets
+    # split the dataset into the train, val, and test sets
     all_recordID = X["RecordID"].unique()
-    positive = (y == 1)["SepsisLabel"]  # positive samples
-    positive_sample_IDs = y.loc[positive].index.to_list()
-    negative_sample_IDs = np.setxor1d(all_recordID, positive_sample_IDs)
-    assert len(positive_sample_IDs) + len(negative_sample_IDs) == len(all_recordID)
+    train_set_ids, test_set_ids = train_test_split(all_recordID, test_size=0.2)
+    train_set_ids, val_set_ids = train_test_split(train_set_ids, test_size=0.2)
 
-    # split the dataset into the train, val, and test sets
-    train_positive_set_ids, test_positive_set_ids = train_test_split(
-        positive_sample_IDs, test_size=0.2
-    )
-    train_positive_set_ids, val_positive_set_ids = train_test_split(
-        train_positive_set_ids, test_size=0.2
-    )
-    train_negative_set_ids, test_negative_set_ids = train_test_split(
-        negative_sample_IDs, test_size=0.2
-    )
-    train_negative_set_ids, val_negative_set_ids = train_test_split(
-        train_negative_set_ids, test_size=0.2
-    )
-    train_set_ids = np.concatenate([train_positive_set_ids, train_negative_set_ids])
-    val_set_ids = np.concatenate([val_positive_set_ids, val_negative_set_ids])
-    test_set_ids = np.concatenate([test_positive_set_ids, test_negative_set_ids])
     train_set_ids.sort()
     val_set_ids.sort()
     test_set_ids.sort()
-    train_set = X[X["RecordID"].isin(train_set_ids)].sort_values(["RecordID", "ICULOS"])
-    val_set = X[X["RecordID"].isin(val_set_ids)].sort_values(["RecordID", "ICULOS"])
-    test_set = X[X["RecordID"].isin(test_set_ids)].sort_values(["RecordID", "ICULOS"])
+    train_set = X[X["RecordID"].isin(train_set_ids)].sort_values(
+        ["RecordID", time_feature]
+    )
+    val_set = X[X["RecordID"].isin(val_set_ids)].sort_values(["RecordID", time_feature])
+    test_set = X[X["RecordID"].isin(test_set_ids)].sort_values(
+        ["RecordID", time_feature]
+    )
+    train_y = train_set[[time_feature, label_feature]]
+    val_y = val_set[[time_feature, label_feature]]
+    test_y = test_set[[time_feature, label_feature]]
 
     # remove useless columns and turn into numpy arrays
     train_set = train_set.drop(["RecordID", "ICULOS"], axis=1)
@@ -131,21 +142,8 @@ def apply_func(df_temp):  # pad and truncate to set the max length of samples as
     test_X = test_X.reshape(len(test_set_ids), 48, -1)
 
     # fetch labels for train/val/test sets
-    train_y = y[y.index.isin(train_set_ids)].sort_index()
-    val_y = y[y.index.isin(val_set_ids)].sort_index()
-    test_y = y[y.index.isin(test_set_ids)].sort_index()
     train_y, val_y, test_y = train_y.to_numpy(), val_y.to_numpy(), test_y.to_numpy()
 
-    # fetch HospAdmTime for train/val/test sets
-    train_HospAdmTime = HospAdmTime[HospAdmTime.index.isin(train_set_ids)].sort_index()
-    val_HospAdmTime = HospAdmTime[HospAdmTime.index.isin(val_set_ids)].sort_index()
-    test_HospAdmTime = HospAdmTime[HospAdmTime.index.isin(test_set_ids)].sort_index()
-    train_HospAdmTime, val_HospAdmTime, test_HospAdmTime = (
-        train_HospAdmTime.to_numpy(),
-        val_HospAdmTime.to_numpy(),
-        test_HospAdmTime.to_numpy(),
-    )
-
     # assemble the final processed data into a dictionary
     processed_dataset = {
         # general info
@@ -155,16 +153,13 @@ def apply_func(df_temp):  # pad and truncate to set the max length of samples as
         "scaler": scaler,
         # train set
         "train_X": train_X,
-        "train_y": train_y.flatten(),
-        "train_HospAdmTime": train_HospAdmTime.flatten(),
+        "train_y": train_y,
         # val set
         "val_X": val_X,
-        "val_y": val_y.flatten(),
-        "val_HospAdmTime": val_HospAdmTime.flatten(),
+        "val_y": val_y,
         # test set
         "test_X": test_X,
-        "test_y": test_y.flatten(),
-        "test_HospAdmTime": test_HospAdmTime.flatten(),
+        "test_y": test_y,
     }
 
     if rate > 0:

From 0e56d87e0c6dc75f27365a359d11a7acc8eb6ec4 Mon Sep 17 00:00:00 2001
From: Wenjie Du
Date: Tue, 28 May 2024 22:43:05 +0800
Subject: [PATCH 2/2] feat: remove useless features in X;

---
 benchpots/preprocessing/physionet_2019.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/benchpots/preprocessing/physionet_2019.py b/benchpots/preprocessing/physionet_2019.py
index 7654b2e..835e4b1 100644
--- a/benchpots/preprocessing/physionet_2019.py
+++ b/benchpots/preprocessing/physionet_2019.py
@@ -121,9 +121,9 @@ def apply_func(df_temp):  # pad and truncate to set the max length of samples as
     test_y = test_set[[time_feature, label_feature]]
 
     # remove useless columns and turn into numpy arrays
-    train_set = train_set.drop(["RecordID", "ICULOS"], axis=1)
-    val_set = val_set.drop(["RecordID", "ICULOS"], axis=1)
-    test_set = test_set.drop(["RecordID", "ICULOS"], axis=1)
+    train_set = train_set.drop(["RecordID", time_feature, label_feature], axis=1)
+    val_set = val_set.drop(["RecordID", time_feature, label_feature], axis=1)
+    test_set = test_set.drop(["RecordID", time_feature, label_feature], axis=1)
     train_X, val_X, test_X = (
         train_set.to_numpy(),
        val_set.to_numpy(),
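
Note for reviewers: the rewritten apply_func no longer pads short stays. It returns None for any record with fewer than 48 time steps, and pandas' DataFrameGroupBy.apply drops groups whose function result is None, so those records are excluded from X altogether. A minimal self-contained sketch of that drop behavior (toy data; only the "RecordID" and "ICULOS" column names match the real schema, the values are made up):

    import pandas as pd

    toy = pd.DataFrame(
        {
            "RecordID": [1, 1, 2],  # record 2 has only one time step
            "ICULOS": [0, 1, 0],
            "HR": [80.0, 82.0, 77.0],
        }
    )

    def keep_if_long_enough(group, min_len=2):
        # mirrors apply_func: short groups return None and vanish from the result
        return group if len(group) >= min_len else None

    out = toy.groupby("RecordID").apply(keep_if_long_enough)
    print(out)  # only RecordID 1 survives; record 2 is dropped, not padded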
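
For completeness, a usage sketch of the updated entry point. This is not part of the patch: the import path and signature follow the diff above, the feature names are illustrative picks from the PhysioNet 2019 column scheme, and the shape comments assume the 48-step truncation shown in apply_func:

    from benchpots.preprocessing.physionet_2019 import preprocess_physionet2019

    # default behavior: keep all features except the static ones
    dataset = preprocess_physionet2019(rate=0.1, pattern="point", subset="training_setA")

    # restrict preprocessing to a hand-picked subset of vitals;
    # "RecordID", "ICULOS", and "SepsisLabel" are appended automatically if omitted
    dataset = preprocess_physionet2019(
        rate=0.1,
        subset="training_setA",
        features=["HR", "O2Sat", "Temp"],
    )

    print(dataset["train_X"].shape)  # (n_train_samples, 48, n_features)
    print(dataset["train_y"].shape)  # (n_train_samples * 48, 2): per-step [ICULOS, SepsisLabel]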