Update physionet2019 pipeline #3

Merged: 2 commits, May 28, 2024
Changes from all commits
benchpots/preprocessing/physionet_2019.py: 129 changes (62 additions, 67 deletions)
@@ -16,7 +16,13 @@
 from .utils import create_missingness, print_final_dataset_info


-def preprocess_physionet2019(rate, pattern: str = "point", subset="all", **kwargs):
+def preprocess_physionet2019(
+    rate,
+    pattern: str = "point",
+    subset="all",
+    features: list = None,
+    **kwargs,
+):
     """Generate a fully-prepared PhysioNet2019 dataset for benchmarking and validating POTS models.

     Parameters
@@ -30,6 +36,8 @@ def preprocess_physionet2019(rate, pattern: str = "point", subset="all", **kwargs):

     subset

+    features
+
     Returns
     -------
     processed_dataset: dict,
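A quick sketch of how the new features argument might be used, assuming the function is imported from the module shown in this diff and that the named vital-sign columns exist in the PhysioNet 2019 tables; per the code below, RecordID, ICULOS, and SepsisLabel are appended automatically if omitted:

from benchpots.preprocessing.physionet_2019 import preprocess_physionet2019

dataset = preprocess_physionet2019(
    rate=0.1,                 # corrupt 10% of observed values
    pattern="point",          # point (random) missingness
    subset="training_setA",
    features=["HR", "O2Sat", "Temp", "MAP"],  # assumed column names
)
print(dataset["train_X"].shape)  # (n_train_records, 48, n_features)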
@@ -38,81 +46,84 @@ def preprocess_physionet2019(rate, pattern: str = "point", subset="all", **kwargs):
"""

     def apply_func(df_temp):  # pad and truncate to set the max length of samples as 48
         missing = list(set(range(0, 48)).difference(set(df_temp["ICULOS"])))
         missing_part = pd.DataFrame({"ICULOS": missing})
         df_temp = pd.concat(
             [df_temp, missing_part], ignore_index=False, sort=False
         )  # pad the sample's length to 48 if it doesn't have enough time steps
-        df_temp = df_temp.set_index("ICULOS").sort_index().reset_index()
-        df_temp = df_temp.iloc[:48]  # truncate
+        if len(df_temp) < 48:
+            return None
+        else:
+            df_temp = df_temp.set_index("ICULOS").sort_index().reset_index()
+            df_temp = df_temp.iloc[:48]  # truncate
         return df_temp
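For readers skimming the diff, here is a self-contained sketch of what the pad-and-truncate helper does to one record (plain pandas, toy data):

import pandas as pd

# a toy record observed at only 3 of the 48 ICULOS hours
df = pd.DataFrame({"ICULOS": [0, 2, 5], "HR": [80.0, 82.0, 79.0]})
missing = list(set(range(0, 48)).difference(set(df["ICULOS"])))
df = pd.concat([df, pd.DataFrame({"ICULOS": missing})], sort=False)
df = df.set_index("ICULOS").sort_index().reset_index().iloc[:48]
print(df.shape)  # (48, 2): 48 hourly rows, HR is NaN where unobserved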

     all_subsets = ["all", "training_setA", "training_setB"]
     assert (
-        subset.lower() in all_subsets
+        subset in all_subsets
     ), f"subset should be one of {all_subsets}, but got {subset}"
     assert 0 <= rate < 1, f"rate must be in [0, 1), but got {rate}"

     # read the raw data
     data = tsdb.load("physionet_2019")
-    data["static_features"].remove("HospAdmTime")  # keep HospAdmTime for now
+    all_features = set(data["training_setA"].columns)
+    label_feature = "SepsisLabel"  # feature SepsisLabel contains labels indicating whether patients get sepsis
+    time_feature = "ICULOS"  # ICU length of stay (hours since ICU admission)

     if subset != "all":
         df = data[subset]
         X = df.reset_index(drop=True)
         unique_ids = df["RecordID"].unique()
         y = data[f"outcomes-{subset.split('_')[-1]}"]
         y = y.loc[unique_ids]
     else:
         df = pd.concat([data["training_setA"], data["training_setB"]], sort=True)
         X = df.reset_index(drop=True)
         unique_ids = df["RecordID"].unique()
         y = pd.concat([data["outcomes-setA"], data["outcomes-setB"]])
         y = y.loc[unique_ids]

-    # remove the other static features, e.g. age, gender
-    X = X.drop(data["static_features"], axis=1)
+    if (
+        features is None
+    ):  # if features are not specified, use all features except the static ones, e.g. age
+        X = X.drop(data["static_features"], axis=1)
+    else:  # if features are specified by users, only use the specified features
+        # check that the given features are valid
+        features_set = set(features)
+        if not all_features.issuperset(features_set):
+            intersection_feats = all_features.intersection(features_set)
+            difference = features_set.difference(intersection_feats)
+            raise ValueError(
+                f"Given features contain invalid features that are not in the dataset: {difference}"
+            )
+        # check that the given features include the ones necessary for preprocessing
+        if "RecordID" not in features:
+            features.append("RecordID")
+        if label_feature not in features:
+            features.append(label_feature)
+        if time_feature not in features:
+            features.append(time_feature)
+
+        # finally, select the specified features
+        X = X[features]
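The feature-validation branch above boils down to standard set algebra; a standalone sketch with made-up column names:

# hypothetical feature universe and user request
all_features = {"RecordID", "ICULOS", "SepsisLabel", "HR", "O2Sat"}
features = ["HR", "O2Sat"]
invalid = set(features).difference(all_features)
assert not invalid, f"invalid features: {invalid}"
for col in ("RecordID", "SepsisLabel", "ICULOS"):  # bookkeeping columns the pipeline needs
    if col not in features:
        features.append(col)
print(features)  # ['HR', 'O2Sat', 'RecordID', 'SepsisLabel', 'ICULOS']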

     X = X.groupby("RecordID").apply(apply_func)
     X = X.drop("RecordID", axis=1)
     X = X.reset_index()
-    HospAdmTime = X[["RecordID", "HospAdmTime"]].set_index("RecordID").dropna()
-    X = X.drop(["level_1", "HospAdmTime"], axis=1)
+    X = X.drop(["level_1"], axis=1)
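Why a column named level_1 exists at all: groupby(...).apply(...) returns a frame indexed by (RecordID, per-record row position), and reset_index() materializes the unnamed second level as level_1. A minimal reproduction on toy data:

import pandas as pd

df = pd.DataFrame({"RecordID": [1, 1, 2], "ICULOS": [0, 1, 0], "HR": [80, 81, 90]})

def pad(g):  # mimic apply_func: re-index each record, producing a fresh 0..n index
    return g.drop("RecordID", axis=1).set_index("ICULOS").reindex(range(3)).reset_index()

out = df.groupby("RecordID").apply(pad).reset_index()
print(out.columns.tolist())  # ['RecordID', 'level_1', 'ICULOS', 'HR']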

-    # PhysioNet2019 is an imbalanced dataset, hence, we separate positive and negative samples here for later splitting
-    # This is to ensure positive and negative ratios are similar in train/val/test sets
+    # split the dataset into the train, val, and test sets
     all_recordID = X["RecordID"].unique()
-    positive = (y == 1)["SepsisLabel"]  # positive samples
-    positive_sample_IDs = y.loc[positive].index.to_list()
-    negative_sample_IDs = np.setxor1d(all_recordID, positive_sample_IDs)
-    assert len(positive_sample_IDs) + len(negative_sample_IDs) == len(all_recordID)
+    train_set_ids, test_set_ids = train_test_split(all_recordID, test_size=0.2)
+    train_set_ids, val_set_ids = train_test_split(train_set_ids, test_size=0.2)
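Note that this PR drops the hand-rolled stratified split (the removed lines just below) in favor of a plain random split. If class balance across splits matters, scikit-learn can stratify directly; a sketch of that alternative, not what this PR implements:

import numpy as np
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(0)
record_ids = np.arange(100)
outcomes = rng.integers(0, 2, size=100)  # toy per-record sepsis labels
# stratify keeps the positive/negative ratio similar in both splits
train_ids, test_ids = train_test_split(record_ids, test_size=0.2, stratify=outcomes)
print(len(train_ids), len(test_ids))  # 80 20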

-    # split the dataset into the train, val, and test sets
-    train_positive_set_ids, test_positive_set_ids = train_test_split(
-        positive_sample_IDs, test_size=0.2
-    )
-    train_positive_set_ids, val_positive_set_ids = train_test_split(
-        train_positive_set_ids, test_size=0.2
-    )
-    train_negative_set_ids, test_negative_set_ids = train_test_split(
-        negative_sample_IDs, test_size=0.2
-    )
-    train_negative_set_ids, val_negative_set_ids = train_test_split(
-        train_negative_set_ids, test_size=0.2
-    )
-    train_set_ids = np.concatenate([train_positive_set_ids, train_negative_set_ids])
-    val_set_ids = np.concatenate([val_positive_set_ids, val_negative_set_ids])
-    test_set_ids = np.concatenate([test_positive_set_ids, test_negative_set_ids])
     train_set_ids.sort()
     val_set_ids.sort()
     test_set_ids.sort()
-    train_set = X[X["RecordID"].isin(train_set_ids)].sort_values(["RecordID", "ICULOS"])
-    val_set = X[X["RecordID"].isin(val_set_ids)].sort_values(["RecordID", "ICULOS"])
-    test_set = X[X["RecordID"].isin(test_set_ids)].sort_values(["RecordID", "ICULOS"])
+    train_set = X[X["RecordID"].isin(train_set_ids)].sort_values(
+        ["RecordID", time_feature]
+    )
+    val_set = X[X["RecordID"].isin(val_set_ids)].sort_values(["RecordID", time_feature])
+    test_set = X[X["RecordID"].isin(test_set_ids)].sort_values(
+        ["RecordID", time_feature]
+    )
+    train_y = train_set[[time_feature, label_feature]]
+    val_y = val_set[[time_feature, label_feature]]
+    test_y = test_set[[time_feature, label_feature]]

     # remove useless columns and turn into numpy arrays
-    train_set = train_set.drop(["RecordID", "ICULOS"], axis=1)
-    val_set = val_set.drop(["RecordID", "ICULOS"], axis=1)
-    test_set = test_set.drop(["RecordID", "ICULOS"], axis=1)
+    train_set = train_set.drop(["RecordID", time_feature, label_feature], axis=1)
+    val_set = val_set.drop(["RecordID", time_feature, label_feature], axis=1)
+    test_set = test_set.drop(["RecordID", time_feature, label_feature], axis=1)
     train_X, val_X, test_X = (
         train_set.to_numpy(),
         val_set.to_numpy(),
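In the lines elided between these two hunks, the arrays are presumably standardized (a scaler object appears in the returned dictionary below) and reshaped from the flat (n_records * 48, n_features) layout into 3D sequences. The reshape itself is just:

import numpy as np

n_records, n_steps, n_feats = 4, 48, 3  # toy sizes
flat = np.zeros((n_records * n_steps, n_feats))
seq = flat.reshape(n_records, n_steps, -1)  # one 48-step sequence per record
print(seq.shape)  # (4, 48, 3)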
@@ -131,21 +142,8 @@ def apply_func(df_temp):  # pad and truncate to set the max length of samples as 48
     test_X = test_X.reshape(len(test_set_ids), 48, -1)

     # fetch labels for train/val/test sets
-    train_y = y[y.index.isin(train_set_ids)].sort_index()
-    val_y = y[y.index.isin(val_set_ids)].sort_index()
-    test_y = y[y.index.isin(test_set_ids)].sort_index()
     train_y, val_y, test_y = train_y.to_numpy(), val_y.to_numpy(), test_y.to_numpy()

-    # fetch HospAdmTime for train/val/test sets
-    train_HospAdmTime = HospAdmTime[HospAdmTime.index.isin(train_set_ids)].sort_index()
-    val_HospAdmTime = HospAdmTime[HospAdmTime.index.isin(val_set_ids)].sort_index()
-    test_HospAdmTime = HospAdmTime[HospAdmTime.index.isin(test_set_ids)].sort_index()
-    train_HospAdmTime, val_HospAdmTime, test_HospAdmTime = (
-        train_HospAdmTime.to_numpy(),
-        val_HospAdmTime.to_numpy(),
-        test_HospAdmTime.to_numpy(),
-    )
-
     # assemble the final processed data into a dictionary
     processed_dataset = {
         # general info
@@ -155,16 +153,13 @@ def apply_func(df_temp):  # pad and truncate to set the max length of samples as 48
"scaler": scaler,
         # train set
         "train_X": train_X,
-        "train_y": train_y.flatten(),
-        "train_HospAdmTime": train_HospAdmTime.flatten(),
+        "train_y": train_y,
         # val set
         "val_X": val_X,
-        "val_y": val_y.flatten(),
-        "val_HospAdmTime": val_HospAdmTime.flatten(),
+        "val_y": val_y,
         # test set
         "test_X": test_X,
-        "test_y": test_y.flatten(),
-        "test_HospAdmTime": test_HospAdmTime.flatten(),
+        "test_y": test_y,
     }

     if rate > 0:
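The elided tail presumably applies create_missingness to the train/val/test arrays when rate > 0 and then calls print_final_dataset_info (both are imported at the top of the file). An end-to-end sketch of the updated pipeline, assuming the module path from this diff; the first call downloads the data through tsdb:

from benchpots.preprocessing.physionet_2019 import preprocess_physionet2019

ds = preprocess_physionet2019(rate=0.1, pattern="point", subset="training_setA")
for split in ("train", "val", "test"):
    print(split, ds[f"{split}_X"].shape, ds[f"{split}_y"].shape)
# After this PR, *_y holds per-timestep (ICULOS, SepsisLabel) pairs instead of
# one flattened label per record, and the *_HospAdmTime entries are gone.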