Update physionet2019 pipeline #3

Merged: 2 commits, May 28, 2024
Changes from all commits
benchpots/preprocessing/physionet_2019.py: 129 changes (62 additions, 67 deletions)
@@ -16,7 +16,13 @@
 from .utils import create_missingness, print_final_dataset_info


-def preprocess_physionet2019(rate, pattern: str = "point", subset="all", **kwargs):
+def preprocess_physionet2019(
+    rate,
+    pattern: str = "point",
+    subset="all",
+    features: list = None,
+    **kwargs,
+):
     """Generate a fully-prepared PhysioNet2019 dataset for benchmarking and validating POTS models.

     Parameters
@@ -30,6 +36,8 @@ def preprocess_physionet2019(rate, pattern: str = "point", subset="all", **kwargs):

     subset

+    features
+
     Returns
     -------
     processed_dataset: dict,
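A quick sketch of how the new features argument might be used, assuming the function is imported from the module shown in this diff and that the named vital-sign columns exist in the PhysioNet 2019 tables; per the code below, RecordID, ICULOS, and SepsisLabel are appended automatically if omitted:

from benchpots.preprocessing.physionet_2019 import preprocess_physionet2019

dataset = preprocess_physionet2019(
    rate=0.1,                 # corrupt 10% of observed values
    pattern="point",          # point (random) missingness
    subset="training_setA",
    features=["HR", "O2Sat", "Temp", "MAP"],  # assumed column names
)
print(dataset["train_X"].shape)  # (n_train_records, 48, n_features)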
@@ -38,81 +46,84 @@ def preprocess_physionet2019(rate, pattern: str = "point", subset="all", **kwargs):
"""

     def apply_func(df_temp):  # pad and truncate to set the max length of samples as 48
         missing = list(set(range(0, 48)).difference(set(df_temp["ICULOS"])))
         missing_part = pd.DataFrame({"ICULOS": missing})
         df_temp = pd.concat(
             [df_temp, missing_part], ignore_index=False, sort=False
         )  # pad the sample's length to 48 if it doesn't have enough time steps
-        df_temp = df_temp.set_index("ICULOS").sort_index().reset_index()
-        df_temp = df_temp.iloc[:48]  # truncate
+        if len(df_temp) < 48:
+            return None
+        else:
+            df_temp = df_temp.set_index("ICULOS").sort_index().reset_index()
+            df_temp = df_temp.iloc[:48]  # truncate
         return df_temp
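For readers skimming the diff, here is a self-contained sketch of what the pad-and-truncate helper does to one record (plain pandas, toy data):

import pandas as pd

# a toy record observed at only 3 of the 48 ICULOS hours
df = pd.DataFrame({"ICULOS": [0, 2, 5], "HR": [80.0, 82.0, 79.0]})
missing = list(set(range(0, 48)).difference(set(df["ICULOS"])))
df = pd.concat([df, pd.DataFrame({"ICULOS": missing})], sort=False)
df = df.set_index("ICULOS").sort_index().reset_index().iloc[:48]
print(df.shape)  # (48, 2): 48 hourly rows, HR is NaN where unobserved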

     all_subsets = ["all", "training_setA", "training_setB"]
     assert (
-        subset.lower() in all_subsets
+        subset in all_subsets
     ), f"subset should be one of {all_subsets}, but got {subset}"
     assert 0 <= rate < 1, f"rate must be in [0, 1), but got {rate}"

     # read the raw data
     data = tsdb.load("physionet_2019")
-    data["static_features"].remove("HospAdmTime")  # keep HospAdmTime for now
+    all_features = set(data["training_setA"].columns)
+    label_feature = "SepsisLabel"  # feature SepsisLabel contains labels indicating whether patients get sepsis
+    time_feature = "ICULOS"  # ICU length of stay (hours since ICU admission)

     if subset != "all":
         df = data[subset]
         X = df.reset_index(drop=True)
         unique_ids = df["RecordID"].unique()
         y = data[f"outcomes-{subset.split('_')[-1]}"]
         y = y.loc[unique_ids]
     else:
         df = pd.concat([data["training_setA"], data["training_setB"]], sort=True)
         X = df.reset_index(drop=True)
         unique_ids = df["RecordID"].unique()
         y = pd.concat([data["outcomes-setA"], data["outcomes-setB"]])
         y = y.loc[unique_ids]

-    # remove the other static features, e.g. age, gender
-    X = X.drop(data["static_features"], axis=1)
+    if (
+        features is None
+    ):  # if features are not specified, use all features except the static ones, e.g. age
+        X = X.drop(data["static_features"], axis=1)
+    else:  # if features are specified by users, only use the specified features
+        # check that the given features are valid
+        features_set = set(features)
+        if not all_features.issuperset(features_set):
+            intersection_feats = all_features.intersection(features_set)
+            difference = features_set.difference(intersection_feats)
+            raise ValueError(
+                f"Given features contain invalid features that are not in the dataset: {difference}"
+            )
+        # check that the given features include the ones necessary for preprocessing
+        if "RecordID" not in features:
+            features.append("RecordID")
+        if label_feature not in features:
+            features.append(label_feature)
+        if time_feature not in features:
+            features.append(time_feature)
+
+        # finally, select the specified features
+        X = X[features]
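The feature-validation branch above boils down to standard set algebra; a standalone sketch with made-up column names:

# hypothetical feature universe and user request
all_features = {"RecordID", "ICULOS", "SepsisLabel", "HR", "O2Sat"}
features = ["HR", "O2Sat"]
invalid = set(features).difference(all_features)
assert not invalid, f"invalid features: {invalid}"
for col in ("RecordID", "SepsisLabel", "ICULOS"):  # bookkeeping columns the pipeline needs
    if col not in features:
        features.append(col)
print(features)  # ['HR', 'O2Sat', 'RecordID', 'SepsisLabel', 'ICULOS']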

     X = X.groupby("RecordID").apply(apply_func)
     X = X.drop("RecordID", axis=1)
     X = X.reset_index()
-    HospAdmTime = X[["RecordID", "HospAdmTime"]].set_index("RecordID").dropna()
-    X = X.drop(["level_1", "HospAdmTime"], axis=1)
+    X = X.drop(["level_1"], axis=1)
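Why a column named level_1 exists at all: groupby(...).apply(...) returns a frame indexed by (RecordID, per-record row position), and reset_index() materializes the unnamed second level as level_1. A minimal reproduction on toy data:

import pandas as pd

df = pd.DataFrame({"RecordID": [1, 1, 2], "ICULOS": [0, 1, 0], "HR": [80, 81, 90]})

def pad(g):  # mimic apply_func: re-index each record, producing a fresh 0..n index
    return g.drop("RecordID", axis=1).set_index("ICULOS").reindex(range(3)).reset_index()

out = df.groupby("RecordID").apply(pad).reset_index()
print(out.columns.tolist())  # ['RecordID', 'level_1', 'ICULOS', 'HR']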

-    # PhysioNet2019 is an imbalanced dataset, hence, we separate positive and negative samples here for later splitting
-    # This is to ensure positive and negative ratios are similar in train/val/test sets
+    # split the dataset into the train, val, and test sets
     all_recordID = X["RecordID"].unique()
-    positive = (y == 1)["SepsisLabel"]  # positive samples
-    positive_sample_IDs = y.loc[positive].index.to_list()
-    negative_sample_IDs = np.setxor1d(all_recordID, positive_sample_IDs)
-    assert len(positive_sample_IDs) + len(negative_sample_IDs) == len(all_recordID)
+    train_set_ids, test_set_ids = train_test_split(all_recordID, test_size=0.2)
+    train_set_ids, val_set_ids = train_test_split(train_set_ids, test_size=0.2)
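Note that this PR drops the hand-rolled stratified split (the removed lines just below) in favor of a plain random split. If class balance across splits matters, scikit-learn can stratify directly; a sketch of that alternative, not what this PR implements:

import numpy as np
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(0)
record_ids = np.arange(100)
outcomes = rng.integers(0, 2, size=100)  # toy per-record sepsis labels
# stratify keeps the positive/negative ratio similar in both splits
train_ids, test_ids = train_test_split(record_ids, test_size=0.2, stratify=outcomes)
print(len(train_ids), len(test_ids))  # 80 20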

-    # split the dataset into the train, val, and test sets
-    train_positive_set_ids, test_positive_set_ids = train_test_split(
-        positive_sample_IDs, test_size=0.2
-    )
-    train_positive_set_ids, val_positive_set_ids = train_test_split(
-        train_positive_set_ids, test_size=0.2
-    )
-    train_negative_set_ids, test_negative_set_ids = train_test_split(
-        negative_sample_IDs, test_size=0.2
-    )
-    train_negative_set_ids, val_negative_set_ids = train_test_split(
-        train_negative_set_ids, test_size=0.2
-    )
-    train_set_ids = np.concatenate([train_positive_set_ids, train_negative_set_ids])
-    val_set_ids = np.concatenate([val_positive_set_ids, val_negative_set_ids])
-    test_set_ids = np.concatenate([test_positive_set_ids, test_negative_set_ids])
     train_set_ids.sort()
     val_set_ids.sort()
     test_set_ids.sort()
-    train_set = X[X["RecordID"].isin(train_set_ids)].sort_values(["RecordID", "ICULOS"])
-    val_set = X[X["RecordID"].isin(val_set_ids)].sort_values(["RecordID", "ICULOS"])
-    test_set = X[X["RecordID"].isin(test_set_ids)].sort_values(["RecordID", "ICULOS"])
+    train_set = X[X["RecordID"].isin(train_set_ids)].sort_values(
+        ["RecordID", time_feature]
+    )
+    val_set = X[X["RecordID"].isin(val_set_ids)].sort_values(["RecordID", time_feature])
+    test_set = X[X["RecordID"].isin(test_set_ids)].sort_values(
+        ["RecordID", time_feature]
+    )
+    train_y = train_set[[time_feature, label_feature]]
+    val_y = val_set[[time_feature, label_feature]]
+    test_y = test_set[[time_feature, label_feature]]

     # remove useless columns and turn into numpy arrays
-    train_set = train_set.drop(["RecordID", "ICULOS"], axis=1)
-    val_set = val_set.drop(["RecordID", "ICULOS"], axis=1)
-    test_set = test_set.drop(["RecordID", "ICULOS"], axis=1)
+    train_set = train_set.drop(["RecordID", time_feature, label_feature], axis=1)
+    val_set = val_set.drop(["RecordID", time_feature, label_feature], axis=1)
+    test_set = test_set.drop(["RecordID", time_feature, label_feature], axis=1)
     train_X, val_X, test_X = (
         train_set.to_numpy(),
         val_set.to_numpy(),
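In the lines elided between these two hunks, the arrays are presumably standardized (a scaler object appears in the returned dictionary below) and reshaped from the flat (n_records * 48, n_features) layout into 3D sequences. The reshape itself is just:

import numpy as np

n_records, n_steps, n_feats = 4, 48, 3  # toy sizes
flat = np.zeros((n_records * n_steps, n_feats))
seq = flat.reshape(n_records, n_steps, -1)  # one 48-step sequence per record
print(seq.shape)  # (4, 48, 3)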
@@ -131,21 +142,8 @@ def apply_func(df_temp):  # pad and truncate to set the max length of samples as 48
     test_X = test_X.reshape(len(test_set_ids), 48, -1)

     # fetch labels for train/val/test sets
-    train_y = y[y.index.isin(train_set_ids)].sort_index()
-    val_y = y[y.index.isin(val_set_ids)].sort_index()
-    test_y = y[y.index.isin(test_set_ids)].sort_index()
     train_y, val_y, test_y = train_y.to_numpy(), val_y.to_numpy(), test_y.to_numpy()

-    # fetch HospAdmTime for train/val/test sets
-    train_HospAdmTime = HospAdmTime[HospAdmTime.index.isin(train_set_ids)].sort_index()
-    val_HospAdmTime = HospAdmTime[HospAdmTime.index.isin(val_set_ids)].sort_index()
-    test_HospAdmTime = HospAdmTime[HospAdmTime.index.isin(test_set_ids)].sort_index()
-    train_HospAdmTime, val_HospAdmTime, test_HospAdmTime = (
-        train_HospAdmTime.to_numpy(),
-        val_HospAdmTime.to_numpy(),
-        test_HospAdmTime.to_numpy(),
-    )
-
     # assemble the final processed data into a dictionary
     processed_dataset = {
         # general info
@@ -155,16 +153,13 @@ def apply_func(df_temp):  # pad and truncate to set the max length of samples as 48
"scaler": scaler,
         # train set
         "train_X": train_X,
-        "train_y": train_y.flatten(),
-        "train_HospAdmTime": train_HospAdmTime.flatten(),
+        "train_y": train_y,
         # val set
         "val_X": val_X,
-        "val_y": val_y.flatten(),
-        "val_HospAdmTime": val_HospAdmTime.flatten(),
+        "val_y": val_y,
         # test set
         "test_X": test_X,
-        "test_y": test_y.flatten(),
-        "test_HospAdmTime": test_HospAdmTime.flatten(),
+        "test_y": test_y,
     }

     if rate > 0:
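The elided tail presumably applies create_missingness to the train/val/test arrays when rate > 0 and then calls print_final_dataset_info (both are imported at the top of the file). An end-to-end sketch of the updated pipeline, assuming the module path from this diff; the first call downloads the data through tsdb:

from benchpots.preprocessing.physionet_2019 import preprocess_physionet2019

ds = preprocess_physionet2019(rate=0.1, pattern="point", subset="training_setA")
for split in ("train", "val", "test"):
    print(split, ds[f"{split}_X"].shape, ds[f"{split}_y"].shape)
# After this PR, *_y holds per-timestep (ICULOS, SepsisLabel) pairs instead of
# one flattened label per record, and the *_HospAdmTime entries are gone.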